Add a script to analyze results, plus a small merge cleanup

This commit is contained in:
Isadora White 2025-03-05 15:20:16 -08:00
parent 7e7f893cf3
commit 5da24ecf79
5 changed files with 325 additions and 31 deletions

258
analyse_results.py Normal file
View file

@ -0,0 +1,258 @@
import boto3
import os
import json
import re
from botocore.exceptions import ClientError
import json
import argparse
from tqdm import tqdm
import glob
def download_s3_folders(bucket_name, s3_prefix, local_base_dir):
    """
    Downloads groups of folders from S3 based on the next level of prefixes.

    Args:
        bucket_name (str): Name of the S3 bucket.
        s3_prefix (str): Prefix where the folders are located (e.g., 'my-experiments/').
        local_base_dir (str): Local directory to download the folders to.

    Returns:
        list: List of downloaded local folder paths; empty list on S3 access error.
    """
    s3_client = boto3.client('s3')
    downloaded_folders = []
    try:
        # Paginate: a single list_objects_v2 call returns at most 1000 keys,
        # which silently truncated large experiment runs before.
        paginator = s3_client.get_paginator('list_objects_v2')
        s3_folder_prefixes = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix, Delimiter='/'):
            s3_folder_prefixes.extend(p['Prefix'] for p in page.get('CommonPrefixes', []))
        if not s3_folder_prefixes:
            print(f"No folders found under s3://{bucket_name}/{s3_prefix}")
            return downloaded_folders
        # Last real path component of the prefix; rstrip makes this work
        # whether or not the caller included a trailing '/'.
        subfolder = s3_prefix.rstrip('/').split('/')[-1]
        for s3_folder_prefix in tqdm(s3_folder_prefixes):
            folder_name = s3_folder_prefix.split('/')[-2]  # prefixes end with '/', so [-2] is the folder name
            local_folder_path = os.path.join(local_base_dir, subfolder, folder_name)
            os.makedirs(local_folder_path, exist_ok=True)
            downloaded_folders.append(local_folder_path)
            # Download files within the folder (paginated for the same reason).
            found_any = False
            for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder_prefix):
                for obj in page.get('Contents', []):
                    found_any = True
                    s3_key = obj['Key']
                    if s3_key.endswith('.json'):  # Only download json files
                        local_file_path = os.path.join(local_folder_path, os.path.basename(s3_key))
                        try:
                            s3_client.download_file(bucket_name, s3_key, local_file_path)
                        except Exception as e:
                            print(f"Error downloading {s3_key}: {e}")
            if not found_any:
                print(f"No files found in {s3_folder_prefix}")
    except ClientError as e:
        print(f"Error accessing S3: {e}")
        return []
    return downloaded_folders
def analyze_json_file(file_path):
    """
    Analyzes a single JSON transcript file to extract the task outcome.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        bool or None: True when a system turn contains the success marker,
        False when none does, None when the file is missing or unreadable.
    """
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        turns = data['turns'] if 'turns' in data else None
        if isinstance(turns, list):
            # Scan from the end: the success marker appears in a late system turn.
            return any(
                turn.get('role') == 'system'
                and isinstance(turn.get('content'), str)
                and "Task successful ended with code : 2" in turn['content']
                for turn in reversed(turns)
            )
        return False
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {file_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while processing {file_path}: {e}")
        return None
def extract_result(folder_path):
    """
    Determine whether the run stored in a folder succeeded.

    Args:
        folder_path (str): Local folder containing the per-agent JSON transcripts.

    Returns:
        bool or None: True if any transcript reports success, False otherwise,
        None when the folder contains no JSON files.
    """
    folder_name = os.path.basename(folder_path)
    json_files = glob.glob(os.path.join(folder_path, "*.json"))
    if not json_files:
        # Was dead code: an `assert len(json_files) == 2` ran first, so this
        # branch never executed and a bare folder aborted the whole analysis.
        print(f"No JSON files found in {folder_name}")
        return None
    if len(json_files) != 2:
        # Two transcripts (one per agent) are expected; warn but keep going
        # instead of raising (assert is also stripped under `python -O`).
        print(f"Expected 2 json files in {folder_name}, found {len(json_files)}")
    # The run counts as successful if any agent's transcript reports success.
    for json_file in json_files:
        if analyze_json_file(json_file):
            return True
    return False
def is_base(folder_path):
    """True for a baseline run: full plan, depth 0, and no missing resources."""
    has_full_plan = "full_plan" in folder_path
    is_depth_zero = "depth_0" in folder_path
    has_missing = "missing" in folder_path
    return has_full_plan and is_depth_zero and not has_missing
def base_without_plan(folder_path):
    """True for a no-plan baseline: no plan, depth 0, with missing resources."""
    required_tokens = ("no_plan", "depth_0", "missing")
    return all(token in folder_path for token in required_tokens)
def aggregate_results(local_folders):
    """
    Aggregates the analysis results for each folder.

    Args:
        local_folders (list): List of local folder paths containing the JSON files.

    Returns:
        dict: Overall and per-category totals, success counts, and success
        rates (base, base-no-plan, missing, full/partial/no plan, high depth).
    """
    # key -> [successful, total]; replaces fourteen hand-maintained counters.
    counts = {key: [0, 0] for key in (
        "overall", "base", "base_no_plan", "missing",
        "full_plan", "partial_plan", "no_plan", "high_depth",
    )}

    def _tally(key, success):
        # Record one run for a category; success is 0 or 1.
        counts[key][0] += success
        counts[key][1] += 1

    for folder_path in tqdm(local_folders):
        folder_name = os.path.basename(folder_path)
        try:
            # Call extract_result once (it re-reads every JSON in the folder;
            # the original called it twice and discarded the first result).
            result = extract_result(folder_path)
            if result is None:
                # No transcripts found; previously int(None) raised TypeError
                # here. Count the run as unsuccessful instead.
                print(f"No result for {folder_name}; counting as unsuccessful")
            success = int(bool(result))
            _tally("overall", success)
            if "missing" in folder_path:
                _tally("missing", success)
            if is_base(folder_path):
                _tally("base", success)
            if base_without_plan(folder_path):
                _tally("base_no_plan", success)
            if "full_plan" in folder_path and not is_base(folder_path):
                _tally("full_plan", success)
            if "partial_plan" in folder_path and not is_base(folder_path):
                _tally("partial_plan", success)
            if "no_plan" in folder_path and not is_base(folder_path):
                _tally("no_plan", success)
            # Parenthesized: the original `a or b and c` bound `and` tighter
            # than `or`, i.e. depth_1 paths skipped the is_base() check.
            if ("depth_1" in folder_path or "depth_2" in folder_path) and not is_base(folder_path):
                _tally("high_depth", success)
        except Exception as e:
            print(f"Error processing {folder_name}: {e}")

    def _stats(key, prefix):
        # Expand one [successful, total] pair into the three output fields.
        successful, total = counts[key]
        return {
            f"{prefix}total": total,
            f"{prefix}successful": successful,
            f"{prefix}success_rate": successful / total if total > 0 else 0,
        }

    results = _stats("overall", "")
    for key in ("base", "base_no_plan", "missing", "full_plan",
                "partial_plan", "no_plan", "high_depth"):
        results.update(_stats(key, key + "_"))
    return results
def get_immediate_subdirectories(a_dir):
    """Return full paths of the directories directly inside *a_dir*."""
    candidates = (os.path.join(a_dir, entry) for entry in os.listdir(a_dir))
    return [path for path in candidates if os.path.isdir(path)]
# --- Main Execution ---
if __name__ == "__main__":
    # 1. Download folders from AWS (optional) or reuse a local results directory.
    parser = argparse.ArgumentParser()
    # store_true instead of type=bool: argparse applies bool() to the raw
    # string, so any non-empty value — including "False" — was truthy and
    # `--s3_download False` still triggered a download.
    parser.add_argument('--s3_download', action='store_true', help='Download folders from S3')
    parser.add_argument('--aws_bucket_name', default="mindcraft", type=str, help='AWS bucket name')
    parser.add_argument('--s3_folder_prefix', default="experiments/4o_craft_better_tasks_03-02_07-15/", type=str, help='S3 folder prefix')
    parser.add_argument('--local_download_dir', default="results/4o_craft_better_tasks_03-02_07-15/", type=str, help='Local download directory')
    args = parser.parse_args()

    if args.s3_download:
        print(f"Downloading folders from s3://{args.aws_bucket_name}/{args.s3_folder_prefix} to {args.local_download_dir}...")
        folders = download_s3_folders(args.aws_bucket_name, args.s3_folder_prefix, args.local_download_dir)
    else:
        folders = get_immediate_subdirectories(args.local_download_dir)
    print(folders)

    # 2. Analyze files and aggregate per-category success rates.
    results = aggregate_results(folders)
    print(results)

    # 3. Save results to a file. os.path.join avoids the doubled '/' the old
    # string concatenation produced when the directory ended with a slash.
    results_path = os.path.join(args.local_download_dir, "results.txt")
    with open(results_path, "w") as file:
        file.write("Results\n")
        for key, value in results.items():
            file.write(f"{key}: {value}\n")
    print(f"Results saved to {results_path}")

View file

@ -0,0 +1,25 @@
Results
total: 823
successful: 196
success_rate: 0.23815309842041313
base_total: 69
base_successful: 20
base_success_rate: 0.2898550724637681
base_no_plan_total: 27
base_no_plan_successful: 10
base_no_plan_success_rate: 0.37037037037037035
missing_total: 375
missing_successful: 72
missing_success_rate: 0.192
full_plan_total: 196
full_plan_successful: 39
full_plan_success_rate: 0.1989795918367347
partial_plan_total: 282
partial_plan_successful: 62
partial_plan_success_rate: 0.2198581560283688
no_plan_total: 276
no_plan_successful: 75
no_plan_success_rate: 0.2717391304347826
high_depth_total: 505
high_depth_successful: 90
high_depth_success_rate: 0.1782178217821782

View file

@ -0,0 +1,21 @@
Folder: total -> 304
Folder: successful -> 77
Folder: success_rate -> 0.2532894736842105
Folder: base_total -> 39
Folder: base_successful -> 12
Folder: base_success_rate -> 0.3076923076923077
Folder: missing_total -> 102
Folder: missing_successful -> 19
Folder: missing_success_rate -> 0.18627450980392157
Folder: full_plan_total -> 57
Folder: full_plan_successful -> 9
Folder: full_plan_success_rate -> 0.15789473684210525
Folder: partial_plan_total -> 108
Folder: partial_plan_successful -> 26
Folder: partial_plan_success_rate -> 0.24074074074074073
Folder: no_plan_total -> 99
Folder: no_plan_successful -> 30
Folder: no_plan_success_rate -> 0.30303030303030304
Folder: high_depth_total -> 165
Folder: high_depth_successful -> 29
Folder: high_depth_success_rate -> 0.17575757575757575

21
results/clauderesults.txt Normal file
View file

@ -0,0 +1,21 @@
Folder: total -> 601
Folder: successful -> 210
Folder: success_rate -> 0.34941763727121466
Folder: base_total -> 69
Folder: base_successful -> 31
Folder: base_success_rate -> 0.4492753623188406
Folder: missing_total -> 232
Folder: missing_successful -> 87
Folder: missing_success_rate -> 0.375
Folder: full_plan_total -> 126
Folder: full_plan_successful -> 47
Folder: full_plan_success_rate -> 0.373015873015873
Folder: partial_plan_total -> 201
Folder: partial_plan_successful -> 62
Folder: partial_plan_success_rate -> 0.30845771144278605
Folder: no_plan_total -> 205
Folder: no_plan_successful -> 70
Folder: no_plan_success_rate -> 0.34146341463414637
Folder: high_depth_total -> 292
Folder: high_depth_successful -> 55
Folder: high_depth_success_rate -> 0.18835616438356165

View file

@ -64,37 +64,6 @@ export class Agent {
save_data = this.history.load();
}
if (this.task) {
this.task.loadTask(task_path, task_id);
this.taskTimeout = this.task.timeout || 300;
this.taskStartTime = Date.now();
if (this.task.type === 'harvest' || this.task.type === 'techtree') {
// todo: this validator doesn't exist?
// this.validator = new TechTreeHarvestValidator(this.task, this.bot);
}
// this.validator = new TechTreeHarvestValidator(this.task, this.bot);
} else {
console.log('called without task')
this.task = null;
this.taskTimeout = null;
this.validator = null;
}
// handle blocked actions
if (this.task && "blocked_actions" in this.task) {
if ("agent_number" in this.task && this.task.agent_number > 1) {
this.blocked_actions = this.task.blocked_actions[this.name];
console.log(`Blocked actions for ${this.name}:`, this.blocked_actions);
} else {
this.blocked_actions = this.task.blocked_actions;
console.log(`Blocked actions:`, this.blocked_actions);
}
}
console.log("Is validated:", this.validator && this.validator.validate());
this.bot.on('login', () => {
console.log(this.name, 'logged in!');