- Added new evaluation.py with dynamic agent configuration support
- Implemented comprehensive test suite (38 tests, 100% pass rate)
- Enhanced evaluation_script.py with improved error handling and logging
- Updated analysis tools for better outcome reporting and visualization
- Added extensive documentation including architecture guide and user manuals
- Maintained backward compatibility with existing task formats
- Improved performance and reliability for multi-agent evaluations

Key improvements:
- Flexible agent count configuration (1-N agents)
- Rich outcome data structures with detailed metrics
- Comprehensive error handling and recovery mechanisms
- Enhanced logging and debugging capabilities
- Complete test coverage for production readiness

Files added/modified:
- tasks/evaluation.py (new core evaluation engine)
- tasks/test_*.py (comprehensive test suite)
- docs/ (complete documentation suite)
- Updated analysis and visualization tools
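
"""
Analyse cooking-task experiment results.

Drives the centralized evaluation module (tasks.evaluation) to collect per-task
outcomes, aggregates them into a DataFrame, enriches the data with
cooking-specific metrics parsed from task IDs, prints summary tables, and saves
the results as a CSV under experiments/analysis_results.
"""
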
import os
import json
import re
import argparse
import pandas as pd
from prettytable import PrettyTable
from tqdm import tqdm
import logging
from typing import List, Dict, Any

# Import from our new centralized evaluation module
from tasks.evaluation import extract_task_outcome, aggregate_results_to_dataframe

# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Constants and Setup ---
# Calculate project root directory for reliable path resolution
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
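# For example, if this script lives at <repo>/tasks/<script>.py, project_root resolves
# to <repo> (assumed layout, consistent with the `tasks.evaluation` import above).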
# Define a centralized output directory for analysis results
analysis_output_dir = os.path.join(project_root, "experiments", "analysis_results")
# Ensure the output directory exists
os.makedirs(analysis_output_dir, exist_ok=True)


def get_immediate_subdirectories(a_dir: str) -> List[str]:
    """
    Returns a list of full paths to immediate subdirectories.

    Args:
        a_dir (str): The directory to scan.

    Returns:
        List[str]: A list of absolute paths to the subdirectories.
    """
    if not os.path.isabs(a_dir):
        a_dir = os.path.join(project_root, a_dir)
    if not os.path.isdir(a_dir):
        return []
    return [f.path for f in os.scandir(a_dir) if f.is_dir()]


def enrich_dataframe_with_cooking_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Enriches the DataFrame with cooking-specific metrics by parsing the 'task_id'.

    Warning: This function relies on a specific naming convention for task_id.
    A more robust long-term solution is to store these metrics directly in the
    task definition's metadata.

    Args:
        df (pd.DataFrame): The DataFrame to enrich.

    Returns:
        pd.DataFrame: The enriched DataFrame with new 'num_blocked_agents' and
                      'target_items' columns.
    """
    if df.empty:
        return df

    logging.warning("The 'enrich_dataframe_with_cooking_metrics' function relies on parsing task_id. "
                    "This is fragile and should be replaced by storing metrics directly in the task definition.")

    def get_blocked_agents_from_task_id(task_id: str) -> int:
        """Extracts the number of blocked agents from the task_id string."""
        if not isinstance(task_id, str):
            return 0
        match = re.search(r'blocked_access_([0-9_]+)$', task_id)
        if match:
            return len(match.group(1).split('_'))
        return 0
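
    # Illustrative example (assumed task_id format): for a task_id like
    # 'multiagent_cooking_1_bread_blocked_access_0_2', the suffix '0_2' yields
    # a count of 2 blocked agents.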

    df['num_blocked_agents'] = df['task_id'].apply(get_blocked_agents_from_task_id)

    def get_target_items_from_task_id(task_id: str) -> List[str]:
        """Extracts the list of target cooking items from the task_id string."""
        if not isinstance(task_id, str):
            return []
        clean_name = re.sub(r'^multiagent_cooking_', '', task_id)
        clean_name = re.sub(r'_blocked_access_[0-9_]+$', '', clean_name)
        items = [
            match.group(2).rstrip('_')
            for match in re.finditer(r'([0-9]+)_([a-zA-Z_]+)', clean_name)
        ]
        return items
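
    # Illustrative example (assumed task_id format): a task_id such as
    # 'multiagent_cooking_2_bread_3_cooked_chicken_blocked_access_0_1' reduces to
    # '2_bread_3_cooked_chicken' and yields target_items == ['bread', 'cooked_chicken'].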

    df['target_items'] = df['task_id'].apply(get_target_items_from_task_id)
    return df


def print_blocked_agents_summary(df: pd.DataFrame) -> None:
    """
    Prints a summary table of success rates by the number of blocked agents.

    Args:
        df (pd.DataFrame): The DataFrame containing the analysis results.
    """
    logging.info("\n--- Analysis by Number of Blocked Agents ---")
    if df.empty or 'num_blocked_agents' not in df.columns or df['num_blocked_agents'].sum() == 0:
        logging.warning("No data on blocked agents available for analysis.")
        return

    summary = df.groupby(['model_name', 'num_blocked_agents'])['overall_is_successful'].agg(['sum', 'count'])
    summary['success_rate'] = (summary['sum'] / summary['count']) * 100

    try:
        pivot = summary.reset_index().pivot(
            index='num_blocked_agents',
            columns='model_name',
            values=['success_rate', 'sum', 'count']
        )
    except KeyError:
        logging.error("Could not create pivot table for blocked agents. Check DataFrame content.")
        return

    table = PrettyTable()
    model_names = sorted(df['model_name'].unique())
    table.field_names = ["Blocked Agents"] + [f"{model} (Rate | Success/Total)" for model in model_names]

    for num_blocked in sorted(df['num_blocked_agents'].unique()):
        row = [f"{num_blocked} agent(s)"]
        for model in model_names:
            try:
                rate = pivot.loc[num_blocked, ('success_rate', model)]
                successes = pivot.loc[num_blocked, ('sum', model)]
                total = pivot.loc[num_blocked, ('count', model)]
                row.append(f"{rate:.2f}% | {int(successes)}/{int(total)}")
            except KeyError:
                row.append("N/A")
        table.add_row(row)

    logging.info("\n" + table.get_string())


def print_cooking_item_summary(df: pd.DataFrame) -> None:
    """
    Prints a summary table of success rates by target cooking item.

    Args:
        df (pd.DataFrame): The DataFrame containing the analysis results.
    """
    logging.info("\n--- Analysis by Cooking Item ---")
    if df.empty or 'target_items' not in df.columns:
        logging.warning("No data on cooking items available for analysis.")
        return

    df_items = df.explode('target_items')
    if df_items.empty:
        logging.warning("No cooking items found to analyze.")
        return

    summary = df_items.groupby(['model_name', 'target_items'])['overall_is_successful'].agg(['sum', 'count'])
    summary['success_rate'] = (summary['sum'] / summary['count']) * 100

    try:
        pivot = summary.reset_index().pivot(
            index='target_items',
            columns='model_name',
            values=['success_rate', 'sum', 'count']
        )
    except KeyError:
        logging.error("Could not create pivot table for cooking items. Check DataFrame content.")
        return

    table = PrettyTable()
    model_names = sorted(df['model_name'].unique())
    table.field_names = ["Cooking Item"] + [f"{model} (Rate | Success/Total)" for model in model_names]

    for item in sorted(df_items['target_items'].unique()):
        row = [item]
        for model in model_names:
            try:
                rate = pivot.loc[item, ('success_rate', model)]
                successes = pivot.loc[item, ('sum', model)]
                total = pivot.loc[item, ('count', model)]
                row.append(f"{rate:.2f}% | {int(successes)}/{int(total)}")
            except KeyError:
                row.append("N/A")
        table.add_row(row)

    logging.info("\n" + table.get_string())


def main() -> None:
    """
    Main function to run the cooking task analysis pipeline.

    Parses arguments, finds relevant cooking experiment folders, runs the
    evaluation, enriches the data with cooking-specific metrics, and prints
    summary tables.
    """
    parser = argparse.ArgumentParser(description='Analyze cooking task experiment results.')
    parser.add_argument('--log_dir', type=str, default='experiments',
                        help='Directory containing experiment folders (relative to project root).')
    parser.add_argument('--task_file_path', required=True, type=str,
                        help='Path to the task definition JSON file for cooking tasks.')
    args = parser.parse_args()

    # --- Step 1: Find Cooking-Specific Experiment Folders ---
    log_dir_abs = args.log_dir
    if not os.path.isabs(log_dir_abs):
        log_dir_abs = os.path.join(project_root, log_dir_abs)

    all_exp_folders = get_immediate_subdirectories(log_dir_abs)
    # Filter for folders that are explicitly for cooking tasks
    cooking_folders = [f for f in all_exp_folders if 'cooking' in os.path.basename(f).lower()]

    if not cooking_folders:
        logging.warning(f"No cooking experiment folders found in '{log_dir_abs}'. Exiting.")
        return

    logging.info(f"Found {len(cooking_folders)} cooking experiment folders to analyze.")

    # --- Step 2: Load Task Definitions ---
    try:
        with open(args.task_file_path, 'r') as f:
            task_definitions = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        logging.error(f"Error reading or parsing task file '{args.task_file_path}': {e}")
        return
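
    # The task file is expected to map each experiment folder name (task_id) to its
    # task definition, e.g. {"multiagent_cooking_...": {...}} (illustrative shape,
    # inferred from the lookup below).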

    # --- Step 3: Run Core Evaluation and Aggregation ---
    task_outcomes = []
    for folder in tqdm(cooking_folders, desc="Analyzing cooking tasks"):
        task_id = os.path.basename(folder.strip(os.sep))
        task_def = task_definitions.get(task_id)
        if not task_def:
            logging.warning(f"No task definition found for '{task_id}'. Skipping.")
            continue

        if 'task_id' not in task_def:
            task_def['task_id'] = task_id

        outcome = extract_task_outcome(folder, task_def)

        try:
            model_name = os.path.basename(os.path.dirname(folder))
            outcome.model_name = model_name
        except IndexError:
            pass

        task_outcomes.append(outcome)

    df = aggregate_results_to_dataframe(task_outcomes)

    if df.empty:
        logging.warning("Analysis did not produce any results.")
        return

    # --- Step 4: Enrich with Cooking Metrics and Analyze ---
    df_enriched = enrich_dataframe_with_cooking_metrics(df)

    print_blocked_agents_summary(df_enriched)
    print_cooking_item_summary(df_enriched)

    # --- Step 5: Save Results ---
    output_filename = f"{os.path.basename(os.path.normpath(log_dir_abs))}_cooking_analysis.csv"
    output_path = os.path.join(analysis_output_dir, output_filename)
    df_enriched.to_csv(output_path, index=False)
    logging.info(f"\nDetailed cooking task analysis saved to: {output_path}")


if __name__ == "__main__":
    main()
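

# Example invocation (illustrative script name and paths):
#   python tasks/analyse_cooking_tasks.py --log_dir experiments \
#       --task_file_path path/to/cooking_task_definitions.json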