fix: Resolve API naming inconsistency in analyse_results module

- Re-export enhanced function as 'aggregate_results' for backward compatibility
- Users can now import aggregate_results and get the enhanced functionality
- Updated architecture documentation to reflect the corrected API
- Maintains intuitive API while providing enhanced model extraction features
Johnathan Walker 2025-06-15 23:21:01 -04:00
parent f7947ec3c2
commit 18eca2f5d9
2 changed files with 10 additions and 7 deletions


@@ -39,7 +39,7 @@ graph TD
end
A -- "Calls" --> E
B -- "Calls" --> E
B -- "Calls" --> F
C -- "Calls" --> E
E -- "Iterates over agent logs, calls" --> D
@@ -155,7 +155,7 @@ def aggregate_results_to_dataframe(task_outcomes: List[Dict[str, Any]]) -> pd.Da
* After the loop, it will call `evaluation.aggregate_results_to_dataframe()` to get the final DataFrame.
* All analysis (e.g., calculating overall success rate) will be done using the resulting DataFrame.
3. **Refactor `tasks/analyse_results.py`:**
* This script will follow the same refactoring pattern as `evaluation_script.py`.
* It calls `aggregate_results`, a locally defined enhancement of the `aggregate_results` function from `evaluation.py` that adds model name extraction.
* The complex, name-based categorization (`is_base`, `base_without_plan`) will be entirely replaced by simple Pandas `groupby()` operations on the DataFrame's columns (e.g., `df.groupby('task_type').success_rate.mean()`).
4. **Refactor `tasks/analyze_cooking_tasks.py`:**
* This script will also be refactored to use the new `evaluation` module.
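The `groupby()` replacement described in step 3 can be sketched as follows. The frame contents and the `task_type`/`success_rate` column values here are hypothetical stand-ins, not data from the repository:

```python
import pandas as pd

# Hypothetical results frame; the rows are illustrative stand-ins.
df = pd.DataFrame({
    "task_type": ["cooking", "cooking", "crafting", "crafting"],
    "success_rate": [1.0, 0.0, 1.0, 1.0],
})

# A single groupby replaces the old name-based categorization flags
# (is_base, base_without_plan) entirely.
success_by_type = df.groupby("task_type")["success_rate"].mean()
print(success_by_type)
```

Because each category becomes a group key rather than a boolean column, adding a new task type requires no new code.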


@@ -13,9 +13,7 @@ import concurrent.futures
# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
from tasks.evaluation import (
aggregate_results,
)
from tasks.evaluation import aggregate_results as original_aggregate_results
# --- Constants and Setup ---
# Calculate project root directory to allow for absolute path resolution
@@ -115,7 +113,7 @@ def analyze_results_with_model_extraction(local_folders: List[str], task_definit
pd.DataFrame: A DataFrame containing the detailed evaluation results with model names.
"""
# Use the centralized function with progress bar enabled
results_df = aggregate_results(local_folders, task_definitions, use_tqdm=True)
results_df = original_aggregate_results(local_folders, task_definitions, use_tqdm=True)
# Extract model names from folder paths if possible
if not results_df.empty and 'task_id' in results_df.columns:
@@ -139,6 +137,11 @@ def analyze_results_with_model_extraction(local_folders: List[str], task_definit
return results_df
# Re-export the enhanced function under the name `aggregate_results`
aggregate_results = analyze_results_with_model_extraction
def get_immediate_subdirectories(a_dir: str) -> List[str]:
"""
Gets a list of immediate subdirectories within a given directory.
@@ -203,7 +206,7 @@ def main() -> None:
return
# --- Step 3: Aggregate Results into a DataFrame ---
results_df = analyze_results_with_model_extraction(folders_to_analyze, task_definitions)
results_df = aggregate_results(folders_to_analyze, task_definitions)
if results_df.empty:
logging.warning("Analysis generated no results. Exiting.")
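The import-alias and re-export pattern this commit introduces can be sketched in isolation. The function bodies below are simplified stand-ins for illustration, not the real implementations in `tasks/`:

```python
# Minimal sketch of the re-export pattern: import the original under a
# private name, define an enhanced wrapper, then re-export the wrapper
# under the public name for backward compatibility.

def _original_aggregate_results(folders):
    # Stand-in for tasks.evaluation.aggregate_results.
    return [{"folder": f} for f in folders]

def analyze_results_with_model_extraction(folders):
    # Enhanced version: also records a model name taken from the
    # first path component (an assumed folder-naming convention).
    rows = _original_aggregate_results(folders)
    for row in rows:
        row["model_name"] = row["folder"].split("/")[0]
    return rows

# Re-export under the familiar name; existing callers keep working
# and transparently get the enhanced behaviour.
aggregate_results = analyze_results_with_model_extraction

print(aggregate_results(["gpt-4/run_1", "claude/run_2"]))
```

Binding the public name to the wrapper at module level means `from analyse_results import aggregate_results` resolves to the enhanced function without any caller changes.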