From 18eca2f5d96e4c986f642ffe716e04c5d85f9ddf Mon Sep 17 00:00:00 2001
From: Johnathan Walker
Date: Sun, 15 Jun 2025 23:21:01 -0400
Subject: [PATCH] fix: Resolve API naming inconsistency in analyse_results module

- Re-export enhanced function as 'aggregate_results' for backward compatibility
- Users can now import aggregate_results and get the enhanced functionality
- Updated architecture documentation to reflect the corrected API
- Maintains intuitive API while providing enhanced model extraction features
---
 docs/evaluation_architecture.md |  4 ++--
 tasks/analyse_results.py        | 13 ++++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/docs/evaluation_architecture.md b/docs/evaluation_architecture.md
index f3e0422..5a35d94 100644
--- a/docs/evaluation_architecture.md
+++ b/docs/evaluation_architecture.md
@@ -39,7 +39,7 @@ graph TD
     end
 
     A -- "Calls" --> E
-    B -- "Calls" --> E
+    B -- "Calls" --> F
     C -- "Calls" --> E
     E -- "Iterates over agent logs, calls" --> D
 
@@ -155,7 +155,7 @@ def aggregate_results_to_dataframe(task_outcomes: List[Dict[str, Any]]) -> pd.Da
     * After the loop, it will call `evaluation.aggregate_results_to_dataframe()` to get the final DataFrame.
     * All analysis (e.g., calculating overall success rate) will be done using the resulting DataFrame.
 3. **Refactor `tasks/analyse_results.py`:**
-    * This script will follow the same refactoring pattern as `evaluation_script.py`.
+    * It calls `aggregate_results`, an enhanced version of `evaluation.aggregate_results` that adds model name extraction.
     * The complex, name-based categorization (`is_base`, `base_without_plan`) will be entirely replaced by simple Pandas `groupby()` operations on the DataFrame's columns (e.g., `df.groupby('task_type').success_rate.mean()`).
 4. **Refactor `tasks/analyze_cooking_tasks.py`:**
     * This script will also be refactored to use the new `evaluation` module.
diff --git a/tasks/analyse_results.py b/tasks/analyse_results.py
index bf67295..ba84d35 100644
--- a/tasks/analyse_results.py
+++ b/tasks/analyse_results.py
@@ -13,9 +13,7 @@ import concurrent.futures
 # Set up basic logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-from tasks.evaluation import (
-    aggregate_results,
-)
+from tasks.evaluation import aggregate_results as original_aggregate_results
 
 # --- Constants and Setup ---
 # Calculate project root directory to allow for absolute path resolution
@@ -115,7 +113,7 @@ def analyze_results_with_model_extraction(local_folders: List[str], task_definit
         pd.DataFrame: A DataFrame containing the detailed evaluation results with model names.
     """
     # Use the centralized function with progress bar enabled
-    results_df = aggregate_results(local_folders, task_definitions, use_tqdm=True)
+    results_df = original_aggregate_results(local_folders, task_definitions, use_tqdm=True)
 
     # Extract model names from folder paths if possible
     if not results_df.empty and 'task_id' in results_df.columns:
@@ -139,6 +137,11 @@ def analyze_results_with_model_extraction(local_folders: List[str], task_definit
 
     return results_df
 
+
+# Re-export the enhanced function under the name `aggregate_results`
+aggregate_results = analyze_results_with_model_extraction
+
+
 def get_immediate_subdirectories(a_dir: str) -> List[str]:
     """
     Gets a list of immediate subdirectories within a given directory.
@@ -203,7 +206,7 @@ def main() -> None:
         return
 
     # --- Step 3: Aggregate Results into a DataFrame ---
-    results_df = analyze_results_with_model_extraction(folders_to_analyze, task_definitions)
+    results_df = aggregate_results(folders_to_analyze, task_definitions)
 
     if results_df.empty:
         logging.warning("Analysis generated no results. Exiting.")
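
A minimal usage sketch of the re-exported name after this patch. The folder paths and the empty task-definitions dict below are illustrative placeholders, not values from this repository.

    from tasks.analyse_results import aggregate_results

    # `aggregate_results` now resolves to analyze_results_with_model_extraction,
    # so callers get model-name extraction on top of the base aggregation from
    # tasks.evaluation without changing their import.
    folders = ["experiments/run_a/", "experiments/run_b/"]  # hypothetical agent-log folders
    task_definitions = {}  # hypothetical; normally loaded from the task-definition file
    results_df = aggregate_results(folders, task_definitions)
    print(results_df.columns.tolist())
    print(results_df.head())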