mindcraft/tasks/analyze_cooking_tasks.py
Johnathan Walker cc51242527 feat: Enhanced task evaluation system with flexible agent support and rich outcome reporting
- Added new evaluation.py with dynamic agent configuration support
- Implemented comprehensive test suite (38 tests, 100% pass rate)
- Enhanced evaluation_script.py with improved error handling and logging
- Updated analysis tools for better outcome reporting and visualization
- Added extensive documentation including architecture guide and user manuals
- Maintained backward compatibility with existing task formats
- Improved performance and reliability for multi-agent evaluations

Key improvements:
- Flexible agent count configuration (1-N agents)
- Rich outcome data structures with detailed metrics
- Comprehensive error handling and recovery mechanisms
- Enhanced logging and debugging capabilities
- Complete test coverage for production readiness

Files added/modified:
- tasks/evaluation.py (new core evaluation engine)
- tasks/test_*.py (comprehensive test suite)
- docs/ (complete documentation suite)
- Updated analysis and visualization tools
2025-06-15 22:01:19 -04:00


import os
import json
import re
import argparse
import pandas as pd
from prettytable import PrettyTable
from tqdm import tqdm
import logging
from typing import List
# Import from our new centralized evaluation module
from tasks.evaluation import extract_task_outcome, aggregate_results_to_dataframe
# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# --- Constants and Setup ---
# Calculate project root directory for reliable path resolution
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Define a centralized output directory for analysis results
analysis_output_dir = os.path.join(project_root, "experiments", "analysis_results")
# Ensure the output directory exists
os.makedirs(analysis_output_dir, exist_ok=True)
def get_immediate_subdirectories(a_dir: str) -> List[str]:
"""
Returns a list of full paths to immediate subdirectories.
Args:
a_dir (str): The directory to scan.
Returns:
List[str]: A list of absolute paths to the subdirectories.
"""
if not os.path.isabs(a_dir):
a_dir = os.path.join(project_root, a_dir)
if not os.path.isdir(a_dir):
return []
return [f.path for f in os.scandir(a_dir) if f.is_dir()]
def enrich_dataframe_with_cooking_metrics(df: pd.DataFrame) -> pd.DataFrame:
"""
Enriches the DataFrame with cooking-specific metrics by parsing the 'task_id'.
Warning: This function relies on a specific naming convention for task_id.
A more robust long-term solution is to store these metrics directly in the
task definition's metadata.
Args:
df (pd.DataFrame): The DataFrame to enrich.
Returns:
pd.DataFrame: The enriched DataFrame with new 'num_blocked_agents' and
'target_items' columns.
"""
if df.empty:
return df
logging.warning("The 'enrich_dataframe_with_cooking_metrics' function relies on parsing task_id. "
"This is fragile and should be replaced by storing metrics directly in the task definition.")
def get_blocked_agents_from_task_id(task_id: str) -> int:
"""Extracts the number of blocked agents from the task_id string."""
if not isinstance(task_id, str):
return 0
match = re.search(r'blocked_access_([0-9_]+)$', task_id)
if match:
return len(match.group(1).split('_'))
return 0
df['num_blocked_agents'] = df['task_id'].apply(get_blocked_agents_from_task_id)
def get_target_items_from_task_id(task_id: str) -> List[str]:
"""Extracts the list of target cooking items from the task_id string."""
if not isinstance(task_id, str):
return []
clean_name = re.sub(r'^multiagent_cooking_', '', task_id)
clean_name = re.sub(r'_blocked_access_[0-9_]+$', '', clean_name)
items = [
match.group(2).rstrip('_')
for match in re.finditer(r'([0-9]+)_([a-zA-Z_]+)', clean_name)
]
return items
df['target_items'] = df['task_id'].apply(get_target_items_from_task_id)
return df
def print_blocked_agents_summary(df: pd.DataFrame) -> None:
"""
Prints a summary table of success rates by the number of blocked agents.
Args:
df (pd.DataFrame): The DataFrame containing the analysis results.
"""
logging.info("\n--- Analysis by Number of Blocked Agents ---")
if df.empty or 'num_blocked_agents' not in df.columns or df['num_blocked_agents'].sum() == 0:
logging.warning("No data on blocked agents available for analysis.")
return
summary = df.groupby(['model_name', 'num_blocked_agents'])['overall_is_successful'].agg(['sum', 'count'])
summary['success_rate'] = (summary['sum'] / summary['count']) * 100
try:
pivot = summary.reset_index().pivot(
index='num_blocked_agents',
columns='model_name',
values=['success_rate', 'sum', 'count']
)
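        # The pivot's columns form a MultiIndex of (metric, model_name) pairs,
        # e.g. ('success_rate', 'gpt-4o') in a hypothetical run; the per-model
        # lookups below index into it with exactly those tuples.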
except KeyError:
logging.error("Could not create pivot table for blocked agents. Check DataFrame content.")
return
table = PrettyTable()
model_names = sorted(df['model_name'].unique())
table.field_names = ["Blocked Agents"] + [f"{model} (Rate | Success/Total)" for model in model_names]
for num_blocked in sorted(df['num_blocked_agents'].unique()):
row = [f"{num_blocked} agent(s)"]
for model in model_names:
try:
rate = pivot.loc[num_blocked, ('success_rate', model)]
successes = pivot.loc[num_blocked, ('sum', model)]
total = pivot.loc[num_blocked, ('count', model)]
row.append(f"{rate:.2f}% | {int(successes)}/{int(total)}")
except KeyError:
row.append("N/A")
table.add_row(row)
logging.info("\n" + table.get_string())
def print_cooking_item_summary(df: pd.DataFrame) -> None:
"""
Prints a summary table of success rates by target cooking item.
Args:
df (pd.DataFrame): The DataFrame containing the analysis results.
"""
logging.info("\n--- Analysis by Cooking Item ---")
if df.empty or 'target_items' not in df.columns:
logging.warning("No data on cooking items available for analysis.")
return
    # explode() leaves a NaN where a task_id yielded no target items; drop those rows.
    df_items = df.explode('target_items').dropna(subset=['target_items'])
if df_items.empty:
logging.warning("No cooking items found to analyze.")
return
summary = df_items.groupby(['model_name', 'target_items'])['overall_is_successful'].agg(['sum', 'count'])
summary['success_rate'] = (summary['sum'] / summary['count']) * 100
try:
pivot = summary.reset_index().pivot(
index='target_items',
columns='model_name',
values=['success_rate', 'sum', 'count']
)
except KeyError:
logging.error("Could not create pivot table for cooking items. Check DataFrame content.")
return
table = PrettyTable()
model_names = sorted(df['model_name'].unique())
table.field_names = ["Cooking Item"] + [f"{model} (Rate | Success/Total)" for model in model_names]
for item in sorted(df_items['target_items'].unique()):
row = [item]
for model in model_names:
try:
rate = pivot.loc[item, ('success_rate', model)]
successes = pivot.loc[item, ('sum', model)]
total = pivot.loc[item, ('count', model)]
row.append(f"{rate:.2f}% | {int(successes)}/{int(total)}")
except KeyError:
row.append("N/A")
table.add_row(row)
logging.info("\n" + table.get_string())
def main() -> None:
"""
Main function to run the cooking task analysis pipeline.
Parses arguments, finds relevant cooking experiment folders, runs the
evaluation, enriches the data with cooking-specific metrics, and prints
summary tables.
"""
parser = argparse.ArgumentParser(description='Analyze cooking task experiment results.')
parser.add_argument('--log_dir', type=str, default='experiments',
help='Directory containing experiment folders (relative to project root).')
parser.add_argument('--task_file_path', required=True, type=str,
help='Path to the task definition JSON file for cooking tasks.')
args = parser.parse_args()
# --- Step 1: Find Cooking-Specific Experiment Folders ---
log_dir_abs = args.log_dir
if not os.path.isabs(log_dir_abs):
log_dir_abs = os.path.join(project_root, log_dir_abs)
all_exp_folders = get_immediate_subdirectories(log_dir_abs)
# Filter for folders that are explicitly for cooking tasks
cooking_folders = [f for f in all_exp_folders if 'cooking' in os.path.basename(f).lower()]
if not cooking_folders:
logging.warning(f"No cooking experiment folders found in '{log_dir_abs}'. Exiting.")
return
logging.info(f"Found {len(cooking_folders)} cooking experiment folders to analyze.")
# --- Step 2: Load Task Definitions ---
try:
with open(args.task_file_path, 'r') as f:
task_definitions = json.load(f)
except (FileNotFoundError, json.JSONDecodeError) as e:
logging.error(f"Error reading or parsing task file '{args.task_file_path}': {e}")
return
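    # The task file is expected to map task_id -> task definition (see the lookup
    # below); a hypothetical entry might look like:
    #   {"multiagent_cooking_1_cooked_chicken_1_golden_carrot": {"agent_count": 2, ...}}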
# --- Step 3: Run Core Evaluation and Aggregation ---
task_outcomes = []
for folder in tqdm(cooking_folders, desc="Analyzing cooking tasks"):
task_id = os.path.basename(folder.strip(os.sep))
task_def = task_definitions.get(task_id)
if not task_def:
logging.warning(f"No task definition found for '{task_id}'. Skipping.")
continue
if 'task_id' not in task_def:
task_def['task_id'] = task_id
outcome = extract_task_outcome(folder, task_def)
        # The parent directory of an experiment folder is assumed to encode the model name.
        outcome.model_name = os.path.basename(os.path.dirname(folder))
task_outcomes.append(outcome)
df = aggregate_results_to_dataframe(task_outcomes)
if df.empty:
logging.warning("Analysis did not produce any results.")
return
# --- Step 4: Enrich with Cooking Metrics and Analyze ---
df_enriched = enrich_dataframe_with_cooking_metrics(df)
print_blocked_agents_summary(df_enriched)
print_cooking_item_summary(df_enriched)
# --- Step 5: Save Results ---
output_filename = f"{os.path.basename(os.path.normpath(log_dir_abs))}_cooking_analysis.csv"
output_path = os.path.join(analysis_output_dir, output_filename)
df_enriched.to_csv(output_path, index=False)
logging.info(f"\nDetailed cooking task analysis saved to: {output_path}")
if __name__ == "__main__":
main()