From 1a71d80e41f34174038e925aa3ada383fb5f2bc1 Mon Sep 17 00:00:00 2001 From: Pradyun Ramadorai Date: Sat, 14 Mar 2026 21:58:26 +0000 Subject: [PATCH] [Bugfix] Fix harmony streaming tool call crash and argument splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes for Chat Completions streaming with harmony models (gpt_oss): 1. Tool call arguments split across indices: With speculative decoding (Eagle), multi-token batches can span channel boundaries. extract_harmony_streaming_delta reads harmony_parser.messages to compute tool call indices, so it must be called immediately after each token is processed — not after the entire batch. Otherwise the parser state is fully advanced and base_index is wrong for early tokens, causing argument fragments to land in separate tool calls. Fix: Interleave token processing with delta extraction — process one token through harmony_parser.process(), immediately call extract_harmony_streaming_delta for that token, then process the next token. Merge the per-token deltas into a single DeltaMessage. 2. IndexError crash in autocomplete logic: The unstreamed tool arg tokens autocomplete logic accesses tool_parser.prev_tool_call_arr and tool_parser.streamed_args_for_tool, but harmony models never populate these arrays. Skip this block for harmony models. 
Signed-off-by: Pradyun Ramadorai Co-authored-by: Claude --- .../openai/chat_completion/serving.py | 70 ++++++++++++++----- 1 file changed, 52 insertions(+), 18 deletions(-) diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index bf8beb9b97ab..47ec3e414e62 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -708,24 +708,64 @@ async def chat_completion_stream_generator( harmony_parser = harmony_parsers[i] prev_recipient = harmony_parser.current_recipient - # Track accumulated content per token with their state + # Process each token and extract its delta immediately, + # before processing the next token. This is critical + # because extract_harmony_streaming_delta reads + # harmony_parser.messages to compute tool call indices. + # Processing all tokens first would advance the parser + # state past the point where early tokens' indices are + # correct, causing tool call argument fragments to be + # split across different tool call indices. 
token_states: list[TokenState] = [] + combined_delta: DeltaMessage | None = None for token_id in output.token_ids: harmony_parser.process(token_id) token_delta = harmony_parser.last_content_delta or "" - token_states.append( - TokenState( - harmony_parser.current_channel, - harmony_parser.current_recipient, - token_delta, + ts = TokenState( + harmony_parser.current_channel, + harmony_parser.current_recipient, + token_delta, + ) + token_states.append(ts) + + # Extract delta for THIS token immediately + per_tok_delta, per_tok_tools = ( + extract_harmony_streaming_delta( + harmony_parser=harmony_parser, + token_states=[ts], + prev_recipient=prev_recipient, + include_reasoning=request.include_reasoning, ) ) + if per_tok_tools: + harmony_tools_streamed[i] = True + if ts.recipient: + prev_recipient = ts.recipient + if per_tok_delta is not None: + if combined_delta is None: + combined_delta = per_tok_delta + else: + if per_tok_delta.content: + combined_delta.content = ( + combined_delta.content or "" + ) + per_tok_delta.content + if per_tok_delta.reasoning: + combined_delta.reasoning = ( + combined_delta.reasoning or "" + ) + per_tok_delta.reasoning + if per_tok_delta.tool_calls: + if combined_delta.tool_calls is None: + combined_delta.tool_calls = [] + combined_delta.tool_calls.extend( + per_tok_delta.tool_calls + ) + delta_text = "".join(delta for _, _, delta in token_states) cur_channel = harmony_parser.current_channel - # handle the case where several tokens where generated at once - # including the final token, leading to a delta in the text - # but the current channel to be empty (start state) + # handle the case where several tokens were generated + # at once including the final token, leading to a delta + # in the text but the current channel to be empty if not cur_channel and delta_text: cur_channel = "final" else: @@ -757,15 +797,8 @@ async def chat_completion_stream_generator( current_token_ids = as_list(output.token_ids) if self.use_harmony: - delta_message, 
tools_streamed_flag = ( - extract_harmony_streaming_delta( - harmony_parser=harmony_parser, - token_states=token_states, - prev_recipient=prev_recipient, - include_reasoning=request.include_reasoning, - ) - ) - harmony_tools_streamed[i] |= tools_streamed_flag + # Delta already computed in the per-token loop above + delta_message = combined_delta # handle streaming deltas for tools with named tool_choice elif tool_choice_function_name: # When encountering think end id in prompt_token_ids @@ -1108,6 +1141,7 @@ async def chat_completion_stream_generator( delta_message, output ) and tool_parser + and not self.use_harmony ): latest_delta_len = 0 if (