From 18eca2f5d96e4c986f642ffe716e04c5d85f9ddf Mon Sep 17 00:00:00 2001
From: Johnathan Walker
Date: Sun, 15 Jun 2025 23:21:01 -0400
Subject: [PATCH] fix: Resolve API naming inconsistency in analyse_results module

- Re-export enhanced function as 'aggregate_results' for backward compatibility
- Users can now import aggregate_results and get the enhanced functionality
- Updated architecture documentation to reflect the corrected API
- Maintains intuitive API while providing enhanced model extraction features
---
 docs/evaluation_architecture.md |  4 ++--
 tasks/analyse_results.py        | 13 ++++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/docs/evaluation_architecture.md b/docs/evaluation_architecture.md
index f3e0422..5a35d94 100644
--- a/docs/evaluation_architecture.md
+++ b/docs/evaluation_architecture.md
@@ -39,7 +39,7 @@ graph TD
     end
 
     A -- "Calls" --> E
-    B -- "Calls" --> E
+    B -- "Calls" --> F
     C -- "Calls" --> E
     E -- "Iterates over agent logs, calls" --> D
 
@@ -155,7 +155,7 @@ def aggregate_results_to_dataframe(task_outcomes: List[Dict[str, Any]]) -> pd.Da
     * After the loop, it will call `evaluation.aggregate_results_to_dataframe()` to get the final DataFrame.
     * All analysis (e.g., calculating overall success rate) will be done using the resulting DataFrame.
 3. **Refactor `tasks/analyse_results.py`:**
-    * This script will follow the same refactoring pattern as `evaluation_script.py`.
+    * It calls `aggregate_results`, an enhanced version of `evaluation.aggregate_results` that adds model name extraction.
     * The complex, name-based categorization (`is_base`, `base_without_plan`) will be entirely replaced by simple Pandas `groupby()` operations on the DataFrame's columns (e.g., `df.groupby('task_type').success_rate.mean()`).
 4. **Refactor `tasks/analyze_cooking_tasks.py`:**
     * This script will also be refactored to use the new `evaluation` module.
diff --git a/tasks/analyse_results.py b/tasks/analyse_results.py
index bf67295..ba84d35 100644
--- a/tasks/analyse_results.py
+++ b/tasks/analyse_results.py
@@ -13,9 +13,7 @@ import concurrent.futures
 # Set up basic logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-from tasks.evaluation import (
-    aggregate_results,
-)
+from tasks.evaluation import aggregate_results as original_aggregate_results
 
 # --- Constants and Setup ---
 # Calculate project root directory to allow for absolute path resolution
@@ -115,7 +113,7 @@ def analyze_results_with_model_extraction(local_folders: List[str], task_definit
         pd.DataFrame: A DataFrame containing the detailed evaluation results with model names.
     """
     # Use the centralized function with progress bar enabled
-    results_df = aggregate_results(local_folders, task_definitions, use_tqdm=True)
+    results_df = original_aggregate_results(local_folders, task_definitions, use_tqdm=True)
 
     # Extract model names from folder paths if possible
     if not results_df.empty and 'task_id' in results_df.columns:
@@ -139,6 +137,11 @@ def analyze_results_with_model_extraction(local_folders: List[str], task_definit
 
     return results_df
 
+
+# Re-export the enhanced function under the name `aggregate_results`
+aggregate_results = analyze_results_with_model_extraction
+
+
 def get_immediate_subdirectories(a_dir: str) -> List[str]:
     """
     Gets a list of immediate subdirectories within a given directory.
@@ -203,7 +206,7 @@ def main() -> None:
         return
 
     # --- Step 3: Aggregate Results into a DataFrame ---
-    results_df = analyze_results_with_model_extraction(folders_to_analyze, task_definitions)
+    results_df = aggregate_results(folders_to_analyze, task_definitions)
 
     if results_df.empty:
         logging.warning("Analysis generated no results. Exiting.")
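
A minimal usage sketch of the re-exported name after this patch. The folder paths and the empty task-definitions dict below are illustrative placeholders, not values from this repository.

    from tasks.analyse_results import aggregate_results

    # `aggregate_results` now resolves to analyze_results_with_model_extraction,
    # so callers get model-name extraction on top of the base aggregation from
    # tasks.evaluation without changing their import.
    folders = ["experiments/run_a/", "experiments/run_b/"]  # hypothetical agent-log folders
    task_definitions = {}  # hypothetical; normally loaded from the task-definition file
    results_df = aggregate_results(folders, task_definitions)
    print(results_df.columns.tolist())
    print(results_df.head())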