From 2ebc8b055b9a20beda30e9cb3594b36015b6c723 Mon Sep 17 00:00:00 2001
From: Peter Tomko
Date: Sat, 27 Jun 2026 22:30:55 +0200
Subject: [PATCH] fix(eval): fix search_tool correctness always scoring 0%
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_args_match checked emit_widget (renamed to user_requested_search in the
tool schema) and limit (optional, server-side default). Both mismatched
on every real model call, so tool_correctness was always False
regardless of whether the model used the right keywords and object
types.

Fix: evaluate only keywords (case-insensitive) and object_types — the
two fields that actually determine whether the search was semantically
correct.
---
 .../gooddata_eval/core/evaluators/search_tool.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
index 8d7fa1f62..32fe135a8 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/evaluators/search_tool.py
@@ -6,13 +6,14 @@
 
 
 def _args_match(actual_args: dict, expected_args: dict) -> bool:
-    if sorted(actual_args.get("keywords") or []) != sorted(expected_args.get("keywords") or []):
+    # Only keywords and object_types determine semantic correctness.
+    # limit is optional with a server-side default; emit_widget was renamed to
+    # user_requested_search in the tool schema — neither affects search quality.
+    actual_kw = sorted(k.lower() for k in (actual_args.get("keywords") or []))
+    expected_kw = sorted(k.lower() for k in (expected_args.get("keywords") or []))
+    if actual_kw != expected_kw:
         return False
-    if sorted(actual_args.get("object_types") or []) != sorted(expected_args.get("object_types") or []):
-        return False
-    if actual_args.get("limit") != expected_args.get("limit"):
-        return False
-    return actual_args.get("emit_widget") == expected_args.get("emit_widget")
+    return sorted(actual_args.get("object_types") or []) == sorted(expected_args.get("object_types") or [])
 
 
 class SearchToolEvaluator: