mindcraft/tasks/analyse_results.py

import boto3
import os
import json
import re
from botocore.exceptions import ClientError
import argparse
from tqdm import tqdm
from typing import List, Dict, Any
import pandas as pd
import logging
import concurrent.futures
from tasks.evaluation import (
    extract_task_outcome,
    aggregate_results_to_dataframe,
)

# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Constants and Setup ---
# Calculate project root directory to allow for absolute path resolution
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Define a centralized output directory for all analysis results
analysis_output_dir = os.path.join(project_root, "experiments", "analysis_results")
# Ensure the output directory exists, creating it if necessary
os.makedirs(analysis_output_dir, exist_ok=True)


def download_s3_folders(bucket_name: str, s3_prefix: str, local_base_dir: str, max_workers: int = 10) -> List[str]:
"""
    Downloads experiment folders and their contents from S3 concurrently.

    This function uses a thread pool to parallelize the download of log files,
    which can significantly speed up the process for large-scale experiments.

    Args:
        bucket_name (str): The name of the S3 bucket.
        s3_prefix (str): The S3 prefix (folder path) where the experiments are stored.
        local_base_dir (str): The local directory to download the folders into.
        max_workers (int): The maximum number of concurrent download threads.

    Returns:
List[str]: A list of local paths to the downloaded folders.
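
    Example (prefix and local directory names are illustrative):
        folders = download_s3_folders("mindcraft-experiments",
                                      "my_experiment/", "experiments")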
"""
s3_client = boto3.client('s3')
downloaded_folders = []
if not os.path.isabs(local_base_dir):
        local_base_dir = os.path.join(project_root, local_base_dir)

    def download_file(s3_key, local_path):
try:
s3_client.download_file(bucket_name, s3_key, local_path)
logging.debug(f"Successfully downloaded {s3_key} to {local_path}")
except ClientError as e:
logging.error(f"Failed to download {s3_key}: {e}")
try:
paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix, Delimiter='/')
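        # With Delimiter='/', S3 groups keys under each immediate sub-prefix into
        # CommonPrefixes, i.e. the top-level experiment "folders" under s3_prefix.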
s3_folder_prefixes = []
for page in pages:
if 'CommonPrefixes' in page:
s3_folder_prefixes.extend([p['Prefix'] for p in page['CommonPrefixes']])
if not s3_folder_prefixes:
logging.warning(f"No folders found under s3://{bucket_name}/{s3_prefix}")
return []
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
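            # Map each future back to its S3 key so failures can be attributed below.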
future_to_key = {}
for s3_folder_prefix in tqdm(s3_folder_prefixes, desc="Queueing downloads"):
folder_name = s3_folder_prefix.rstrip('/').split('/')[-1]
local_folder_path = os.path.join(local_base_dir, folder_name)
os.makedirs(local_folder_path, exist_ok=True)
downloaded_folders.append(local_folder_path)
# List objects and submit download tasks
obj_pages = paginator.paginate(Bucket=bucket_name, Prefix=s3_folder_prefix)
for page in obj_pages:
if 'Contents' in page:
for obj in page['Contents']:
s3_key = obj['Key']
if not s3_key.endswith('/'): # Don't download "folders"
local_file_path = os.path.join(local_folder_path, os.path.basename(s3_key))
future = executor.submit(download_file, s3_key, local_file_path)
future_to_key[future] = s3_key
for future in tqdm(concurrent.futures.as_completed(future_to_key), total=len(future_to_key), desc="Downloading files"):
s3_key = future_to_key[future]
try:
future.result()
except Exception as exc:
logging.error(f'{s3_key} generated an exception: {exc}')
except ClientError as e:
logging.error(f"Error accessing S3: {e}")
return []
    return downloaded_folders


def aggregate_results(local_folders: List[str], task_definitions: Dict[str, Any]) -> pd.DataFrame:
"""
    Aggregates experiment results from a list of local folders into a DataFrame.

    This function serves as the core analysis engine, iterating through each task
    folder, extracting outcomes, and compiling them into a single, comprehensive
    DataFrame for further analysis.

    Args:
        local_folders (List[str]): A list of paths to the task run folders.
        task_definitions (Dict[str, Any]): A dictionary of all task definitions,
            keyed by task_id.

    Returns:
pd.DataFrame: A DataFrame containing the detailed evaluation results.
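
    Example (assuming folders were downloaded or discovered earlier):
        df = aggregate_results(folders_to_analyze, task_definitions)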
"""
task_outcomes = []
for folder_path in tqdm(local_folders, desc="Analyzing task folders"):
        task_id = os.path.basename(folder_path.rstrip(os.sep))
task_def = task_definitions.get(task_id)
if not task_def:
logging.warning(f"No task definition found for task_id '{task_id}'. Skipping folder '{folder_path}'.")
continue
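        # Ensure the definition carries its own task_id for downstream reporting.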
if 'task_id' not in task_def:
task_def['task_id'] = task_id
try:
# Use the core evaluation function
outcome = extract_task_outcome(folder_path, task_def)
            # The model name is often encoded in the folder structure; attempt to
            # extract it here, adapting the heuristic if the layout differs.
try:
# e.g. experiments/my_exp_date/claude-3-5-sonnet-latest/task_1
model_name = folder_path.split(os.sep)[-2]
outcome.model_name = model_name
except IndexError:
outcome.model_name = "unknown"
task_outcomes.append(outcome)
except Exception as e:
logging.error(f"Error processing folder {folder_path}: {e}")
    # Convert the list of outcome objects into a DataFrame
    return aggregate_results_to_dataframe(task_outcomes)


def get_immediate_subdirectories(a_dir: str) -> List[str]:
"""
    Gets a list of immediate subdirectories within a given directory.

    Args:
        a_dir (str): The directory to scan.

    Returns:
List[str]: A list of full paths to the immediate subdirectories.
"""
# Ensure a_dir is an absolute path for reliable processing
if not os.path.isabs(a_dir):
a_dir = os.path.join(project_root, a_dir)
if not os.path.isdir(a_dir):
logging.warning(f"Directory not found: {a_dir}")
return []
    return [os.path.join(a_dir, name) for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))]


def main() -> None:
"""
    Main function to run the analysis pipeline.

    Parses command-line arguments, downloads data from S3 if requested,
analyzes the experiment logs, and saves the results to a CSV file.
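
    Example invocation (file and prefix names are illustrative):
        python tasks/analyse_results.py --task_file_path tasks/example_tasks.json
        python tasks/analyse_results.py --s3_download --s3_folder_prefix my_exp/ \
            --task_file_path tasks/example_tasks.json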
"""
parser = argparse.ArgumentParser(description="Analyze Mindcraft experiment results.")
parser.add_argument('--s3_download', action="store_true", help='Download folders from S3 before analysis.')
parser.add_argument('--aws_bucket_name', default="mindcraft-experiments", type=str, help='The name of the AWS S3 bucket.')
parser.add_argument('--s3_folder_prefix', default="", type=str, help='The S3 prefix (folder) to download from.')
parser.add_argument('--local_dir', default="experiments", type=str, help='Local directory with experiment results (relative to project root).')
parser.add_argument('--task_file_path', required=True, type=str, help='Path to the task definition JSON file.')
args = parser.parse_args()
# --- Step 1: Determine Folders to Analyze ---
local_dir_abs = args.local_dir
if not os.path.isabs(local_dir_abs):
local_dir_abs = os.path.join(project_root, local_dir_abs)
if args.s3_download:
if not args.s3_folder_prefix:
logging.error("S3 folder prefix (--s3_folder_prefix) is required for S3 download.")
return
logging.info(f"Downloading folders from s3://{args.aws_bucket_name}/{args.s3_folder_prefix} to {local_dir_abs}...")
folders_to_analyze = download_s3_folders(args.aws_bucket_name, args.s3_folder_prefix, local_dir_abs)
else:
logging.info(f"Analyzing local folders in: {local_dir_abs}")
folders_to_analyze = get_immediate_subdirectories(local_dir_abs)
if not folders_to_analyze:
logging.warning("No folders found to analyze. Exiting.")
return
# --- Step 2: Load Task Definitions ---
try:
with open(args.task_file_path, 'r') as f:
task_definitions = json.load(f)
except (FileNotFoundError, json.JSONDecodeError) as e:
logging.error(f"Could not read or parse task file at '{args.task_file_path}': {e}")
return
# --- Step 3: Aggregate Results into a DataFrame ---
results_df = aggregate_results(folders_to_analyze, task_definitions)
if results_df.empty:
logging.warning("Analysis generated no results. Exiting.")
return
# --- Step 4: Perform High-Level Analysis and Print Summary ---
logging.info("\n--- Overall Results ---")
if 'overall_is_successful' in results_df.columns:
overall_success_rate = results_df['overall_is_successful'].mean()
logging.info(f"Total Tasks Analyzed: {len(results_df)}")
logging.info(f"Overall Success Rate: {overall_success_rate:.2%}")
logging.info("\n--- Analysis by Task Type ---")
    if 'task_type' in results_df.columns and 'overall_is_successful' in results_df.columns:
success_by_type = results_df.groupby('task_type')['overall_is_successful'].agg(['mean', 'count'])
success_by_type.rename(columns={'mean': 'success_rate'}, inplace=True)
logging.info("\n" + success_by_type.to_string())
logging.info("\n--- Analysis by Model Name ---")
    if 'model_name' in results_df.columns and 'overall_is_successful' in results_df.columns:
success_by_model = results_df.groupby('model_name')['overall_is_successful'].agg(['mean', 'count'])
success_by_model.rename(columns={'mean': 'success_rate'}, inplace=True)
logging.info("\n" + success_by_model.to_string())
# --- Step 5: Save Results to CSV ---
if args.s3_folder_prefix:
output_filename_base = args.s3_folder_prefix.strip('/').replace('/', '_')
else:
output_filename_base = os.path.basename(os.path.normpath(local_dir_abs))
results_csv_path = os.path.join(analysis_output_dir, f"{output_filename_base}_analysis_results.csv")
results_df.to_csv(results_csv_path, index=False)
logging.info(f"\nDetailed analysis results saved to: {results_csv_path}")
if __name__ == "__main__":
main()
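

# Example task-definition file shape (illustrative; only the top-level keying
# by task_id is assumed here, the remaining fields are interpreted by
# tasks.evaluation.extract_task_outcome):
# {
#     "task_1": {"task_type": "cooking"},
#     "task_2": {"task_type": "crafting"}
# }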