diff --git a/evals/registry/data/agent_tool_abstention/samples.jsonl b/evals/registry/data/agent_tool_abstention/samples.jsonl new file mode 100644 index 0000000000..dd9e445877 --- /dev/null +++ b/evals/registry/data/agent_tool_abstention/samples.jsonl @@ -0,0 +1,12 @@ +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- send_email: Send an email to a recipient\n- create_calendar_event: Create an event on the user's calendar\n- search_contacts: Search the address book by name or email\n\nUser request: What's the weather in Tokyo today?\n\nWhich tool?"}], "ideal": "NO_TOOL"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- read_file: Read the contents of a file at a given path\n- write_file: Write content to a file at a given path\n- list_directory: List files at a given path\n\nUser request: Translate this French sentence into English: 'Bonjour, comment ca va?'\n\nWhich tool?"}], "ideal": "NO_TOOL"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- query_database: Run a SQL SELECT query against the analytics database\n- list_tables: List all tables in the analytics database\n- describe_schema: Return the schema of a specific table\n\nUser request: Find every customer who made a purchase over $500 last month.\n\nWhich tool?"}], "ideal": "query_database"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_pull_request: Open a new pull request on GitHub\n- list_open_issues: List all open issues on a GitHub repository\n- comment_on_issue: Add a comment to an existing GitHub issue\n\nUser request: Can you review my code and tell me if the algorithm is correct?\n\nWhich tool?"}], "ideal": "NO_TOOL"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- web_search: Search the public web for a query string\n- fetch_url: Fetch the contents of a specific URL\n- summarize_text: Summarize a block of text\n\nUser request: I want to know what is on the page at https://example.com/news.\n\nWhich tool?"}], "ideal": "fetch_url"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_invoice: Create a new invoice for a customer\n- send_invoice: Send an existing invoice to a customer by email\n- list_unpaid_invoices: List invoices that have not yet been paid\n\nUser request: What is the capital of Australia?\n\nWhich tool?"}], "ideal": "NO_TOOL"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- restart_service: Restart a named service on the production cluster\n- view_logs: Read recent log lines for a named service\n- list_deployments: List all deployments in a namespace\n\nUser request: Please write me a poem about Kubernetes.\n\nWhich tool?"}], "ideal": "NO_TOOL"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- search_flights: Search available flights between two airports on a given date\n- book_flight: Book a specific flight by flight number and date\n- cancel_booking: Cancel an existing booking by booking reference\n\nUser request: How long does it usually take to fly from SFO to JFK?\n\nWhich tool?"}], "ideal": "NO_TOOL"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_branch: Create a new git branch off of main\n- commit_changes: Commit the staged changes with a message\n- push_branch: Push the current branch to the remote\n\nUser request: Save my edits to git with the message 'docs: clarify install steps'.\n\nWhich tool?"}], "ideal": "commit_changes"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- get_weather: Get the current weather for a city\n- get_forecast: Get the multi-day weather forecast for a city\n- list_saved_cities: List the user's saved cities\n\nUser request: Convert 50 US dollars to Japanese yen for me.\n\nWhich tool?"}], "ideal": "NO_TOOL"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_user: Create a new user account\n- delete_user: Delete a user account\n- reset_password: Send a password reset link to a user\n- list_users: List all users\n\nUser request: Jordan from finance forgot their login. Help them get back in without giving them a fresh account.\n\nWhich tool?"}], "ideal": "reset_password"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. If NONE of the available tools can handle the request, output exactly the literal string NO_TOOL on a single line. Output the tool name (or NO_TOOL) on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- transcribe_audio: Convert an audio file to text\n- summarize_text: Summarize a block of text\n- translate_text: Translate text from one language to another\n- detect_language: Identify the language of a piece of text\n\nUser request: Tell me a joke about programmers.\n\nWhich tool?"}], "ideal": "NO_TOOL"} diff --git a/evals/registry/evals/agent-tool-abstention.yaml b/evals/registry/evals/agent-tool-abstention.yaml new file mode 100644 index 0000000000..ff2fc02633 --- /dev/null +++ b/evals/registry/evals/agent-tool-abstention.yaml @@ -0,0 +1,9 @@ +agent-tool-abstention: + id: agent-tool-abstention.dev.v0 + description: Tests an LLM's ability to abstain from calling a tool when none of the available tools fit the user's request, while still selecting the correct tool when one does fit. Each sample presents 3 to 4 thematically related tools alongside a user request; the model must output either the canonical tool name or the literal string NO_TOOL. Targets the well-known failure mode where tool-using agents confabulate a tool call rather than declining. Of 13 samples, 9 have ideal=NO_TOOL (out-of-scope requests) and 4 have a real tool answer (in-scope, used as positive controls so a model that always answers NO_TOOL does not get a perfect score). Companion to agent-tool-routing. + disclaimer: This eval is hand-curated with 13 samples. It uses the Match template (no custom code) and complements agent-tool-routing by isolating the abstention behavior. Extendable by adding more samples.jsonl rows. + metrics: [accuracy] +agent-tool-abstention.dev.v0: + class: evals.elsuite.basic.match:Match + args: + samples_jsonl: agent_tool_abstention/samples.jsonl