new train, test, dev tasks and new analysis files

Isadora White 2025-03-16 17:55:05 -07:00
parent 125aa73d6c
commit 1ccba3a4b5
28 changed files with 81428 additions and 4 deletions


@@ -0,0 +1,39 @@
import boto3
import os
import json
import re
from botocore.exceptions import ClientError
import argparse
from tqdm import tqdm
import glob


def analyze_json_file(file_path):
    """
    Analyzes a single JSON file to extract the task outcome.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        bool or None: True if the task succeeded, False if it did not,
        or None if the file could not be read or parsed.
    """
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        if 'turns' in data and isinstance(data['turns'], list):
            for turn in reversed(data['turns']):  # Check turns from the end
                if turn.get('role') == 'system' and isinstance(turn.get('content'), str):
                    if "Task successful ended with code : 2" in turn['content'] or "Task ended in score: 1" in turn['content']:
                        return True
        return False
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {file_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while processing {file_path}: {e}")
        return None
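
The glob and tqdm imports above suggest this helper is meant to be run over many logs at once; a minimal batch-driver sketch, assuming a hypothetical results/ directory layout that is not part of this commit:

# Hypothetical driver: aggregate outcomes across all agent logs.
files = glob.glob("results/**/*.json", recursive=True)
outcomes = [analyze_json_file(p) for p in tqdm(files)]
successes = sum(1 for o in outcomes if o is True)
readable = sum(1 for o in outcomes if o is not None)
if readable:
    print(f"Success rate: {successes}/{readable} ({successes / readable:.1%})")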

analyze_cooking_tasks.py

@@ -0,0 +1,222 @@
import os
import json
import re
from collections import defaultdict


def extract_cooking_items(exp_dir):
    """Extract cooking items from experiment directory name."""
    # Remove prefix and blocked access part
    clean_name = re.sub(r'^multiagent_cooking_', '', exp_dir)
    clean_name = re.sub(r'_blocked_access_[0-9_]+$', '', clean_name)

    # Extract individual items
    items = []
    for item_match in re.finditer(r'([0-9]+)_([a-zA-Z_]+)', clean_name):
        count = int(item_match.group(1))
        item = item_match.group(2)
        # Remove trailing underscores to fix the item name issue
        item = item.rstrip('_')
        items.append(item)
    return items
def analyze_experiments(root_dir):
    # Store results by number of blocked agents
    blocked_access_results = defaultdict(lambda: {
        "success": 0,
        "total": 0,
        "cake_success": 0,
        "cake_total": 0,
        "non_cake_success": 0,
        "non_cake_total": 0
    })

    # Store results by cooking item
    cooking_item_results = defaultdict(lambda: {
        "success": 0,
        "total": 0
    })

    # Keep track of all unique cooking items
    all_cooking_items = set()

    # Get a list of all experiment directories
    experiment_dirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))
                       and d.startswith("multiagent_cooking_")]

    for exp_dir in experiment_dirs:
        # Extract cooking items
        cooking_items = extract_cooking_items(exp_dir)

        # Add to unique items set
        all_cooking_items.update(cooking_items)

        # Check if experiment involves cake
        has_cake = any(item == "cake" for item in cooking_items)

        # Extract blocked access information from directory name
        blocked_access_match = re.search(r'blocked_access_([0-9_]+)$', exp_dir)
        if blocked_access_match:
            blocked_access_str = blocked_access_match.group(1)
            # Count how many agents have blocked access
            num_blocked_agents = len(blocked_access_str.split('_'))
            blocked_key = f"{num_blocked_agents} agent(s)"
        else:
            # No agents blocked
            blocked_key = "0 agent(s)"

        # Check if the task was successful
        is_successful = False
        full_exp_path = os.path.join(root_dir, exp_dir)

        # Get all JSON files in the experiment directory
        agent_files = [f for f in os.listdir(full_exp_path) if f.endswith(".json")]

        # Check each agent file for success information
        for agent_file in agent_files:
            agent_file_path = os.path.join(full_exp_path, agent_file)
            try:
                with open(agent_file_path, 'r') as f:
                    agent_data = json.load(f)

                # Check for success in the turns data
                if "turns" in agent_data:
                    for turn in agent_data["turns"]:
                        if turn.get("role") == "system" and "content" in turn:
                            if isinstance(turn["content"], str) and "Task ended with score : 1" in turn["content"]:
                                is_successful = True
                                break

                # If we found success, no need to check other files
                if is_successful:
                    break
            except (json.JSONDecodeError, IOError) as e:
                print(f"Error reading {agent_file_path}: {e}")
                # Continue to check other agent files instead of failing
                continue

        # Update cooking item results
        for item in cooking_items:
            cooking_item_results[item]["total"] += 1
            if is_successful:
                cooking_item_results[item]["success"] += 1

        # Update the appropriate blocked access counters
        # First update the category-specific counters
        if has_cake:
            blocked_access_results[blocked_key]["cake_total"] += 1
            if is_successful:
                blocked_access_results[blocked_key]["cake_success"] += 1
        else:
            blocked_access_results[blocked_key]["non_cake_total"] += 1
            if is_successful:
                blocked_access_results[blocked_key]["non_cake_success"] += 1

            # Only count non-cake experiments in the main totals
            blocked_access_results[blocked_key]["total"] += 1
            if is_successful:
                blocked_access_results[blocked_key]["success"] += 1

    return blocked_access_results, cooking_item_results, all_cooking_items
def print_blocked_results(results):
    print("\nExperiment Results by Number of Agents with Blocked Access (Excluding Cake Experiments):")
    print("=" * 80)
    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15} | {'Cake Tasks':<15} | {'Non-Cake Tasks':<15}")
    print("-" * 80)

    # Calculate totals
    total_success = 0
    total_experiments = 0
    total_cake = 0
    total_non_cake = 0

    # Sort by number of blocked agents
    for key in sorted(results.keys(), key=lambda x: int(x.split()[0])):
        success = results[key]["success"]
        total = results[key]["total"]
        cake_total = results[key]["cake_total"]
        non_cake_total = results[key]["non_cake_total"]

        # Verify that non_cake_total matches total
        if non_cake_total != total:
            print(f"Warning: Non-cake total ({non_cake_total}) doesn't match the total ({total}) for {key}")

        total_success += success
        total_experiments += total
        total_cake += cake_total
        total_non_cake += non_cake_total

        success_rate = (success / total * 100) if total > 0 else 0
        print(f"{key:<15} | {success_rate:>6.2f}% | {success}/{total:<13} | {cake_total:<15} | {non_cake_total:<15}")

    # Calculate overall success rate (excluding cake experiments)
    overall_success_rate = (total_success / total_experiments * 100) if total_experiments > 0 else 0
    print("-" * 80)
    print(f"{'Overall':<15} | {overall_success_rate:>6.2f}% | {total_success}/{total_experiments:<13} | {total_cake:<15} | {total_non_cake:<15}")

    # Print cake experiment details
    print("\nCake Experiment Details:")
    print("=" * 60)
    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15}")
    print("-" * 60)

    cake_total_success = 0
    cake_total_experiments = 0
    for key in sorted(results.keys(), key=lambda x: int(x.split()[0])):
        cake_success = results[key]["cake_success"]
        cake_total = results[key]["cake_total"]
        cake_total_success += cake_success
        cake_total_experiments += cake_total

        cake_success_rate = (cake_success / cake_total * 100) if cake_total > 0 else 0
        print(f"{key:<15} | {cake_success_rate:>6.2f}% | {cake_success}/{cake_total}")

    cake_overall_success_rate = (cake_total_success / cake_total_experiments * 100) if cake_total_experiments > 0 else 0
    print("-" * 60)
    print(f"{'Overall':<15} | {cake_overall_success_rate:>6.2f}% | {cake_total_success}/{cake_total_experiments}")
def print_cooking_items(cooking_items):
    print("\nUnique Cooking Items Found:")
    print("=" * 60)
    print(", ".join(sorted(cooking_items)))
    print(f"Total unique items: {len(cooking_items)}")


def print_item_results(item_results):
    print("\nExperiment Results by Cooking Item:")
    print("=" * 60)
    print(f"{'Cooking Item':<20} | {'Success Rate':<15} | {'Success/Total':<15}")
    print("-" * 60)

    # Sort by item name
    for item in sorted(item_results.keys()):
        success = item_results[item]["success"]
        total = item_results[item]["total"]
        success_rate = (success / total * 100) if total > 0 else 0
        print(f"{item:<20} | {success_rate:>6.2f}% | {success}/{total}")
    print("-" * 60)


def main():
    # Update this path to your experiments directory
    experiments_root = "../results/llama_70b_hells_kitchen_cooking_tasks"
    print(f"Analyzing experiments in: {os.path.abspath(experiments_root)}")

    blocked_results, item_results, unique_items = analyze_experiments(experiments_root)
    print_blocked_results(blocked_results)
    print_cooking_items(unique_items)
    print_item_results(item_results)


if __name__ == "__main__":
    main()
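
As a worked example of the directory-name parsing above (the experiment name is hypothetical but follows the pattern the regexes expect):

# "multiagent_cooking_1_cake_2_golden_carrot_blocked_access_0_1"
# -> prefix and "_blocked_access_0_1" suffix stripped -> "1_cake_2_golden_carrot"
# -> finditer pairs each count with a name; rstrip('_') trims the underscore
#    that the greedy [a-zA-Z_]+ swallows before the next digit
print(extract_cooking_items(
    "multiagent_cooking_1_cake_2_golden_carrot_blocked_access_0_1"))
# ['cake', 'golden_carrot']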


@@ -274,8 +274,8 @@ def launch_server_experiment(task_path,
     script_content += f"{cp_cmd}\n"
     script_content += "sleep 1\n"
     if s3:
-        s3_cmd = f"aws s3 cp {agent_file_path} s3://{bucket_name}/{exp_name}/{task_id}/{agent}_{_}.json"
-        s3_upload_experiment = f"aws s3 cp {agent_file_path} s3://{bucket_name}/{exp_name}/{task_id}/{agent}_{_}.json"
+        s3_cmd = f"aws s3 cp {agent_file_path} s3://{bucket_name}/{task_type}/{model}/{exp_name}/{task_id}/{agent}_{_}.json"
+        s3_upload_experiment = f"aws s3 cp {agent_file_path} s3://{bucket_name}/{task_type}/{model}/{exp_name}/{task_id}/{agent}_{_}.json"
         script_content += f"echo 'Uploading {agent_file_path} to S3'\n"
         script_content += f"echo '{s3_cmd}'\n"
         script_content += f"{s3_cmd}\n"
@@ -283,7 +283,7 @@ def launch_server_experiment(task_path,
     script_content += f"sleep 10\n"
     if s3:
         for agent in agent_names:
-            script_content += f"aws s3 cp bots/{agent} s3://{bucket_name}/{exp_name}/bots/{agent} --recursive\n"
+            script_content += f"aws s3 cp bots/{agent} s3://{bucket_name}/{task_type}/{model}/{exp_name}/bots/{agent} --recursive\n"
 
     # Create a temporary shell script file
     script_file = f"./tmp/experiment_script_{session_name}.sh"
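
The net effect of this change is that uploads are namespaced by task type and model, so runs of different models no longer collide under the same experiment name. A sketch of the new key layout, with purely illustrative values:

# Illustrative values only; mirrors the f-string in the diff above.
bucket_name, task_type, model = "my-bucket", "cooking", "llama_70b"
exp_name, task_id, agent, _ = "exp1", "0", "agent_a", 0
key = f"s3://{bucket_name}/{task_type}/{model}/{exp_name}/{task_id}/{agent}_{_}.json"
# -> s3://my-bucket/cooking/llama_70b/exp1/0/agent_a_0.json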


@@ -39,7 +39,7 @@ export default
     "code_timeout_mins": -1, // minutes code is allowed to run. -1 for no timeout
     "relevant_docs_count": 5, // Parameter: -1 = all, 0 = no references, 5 = five references. If exceeding the maximum, all reference documents are returned.
-    "max_messages": 150, // max number of messages to keep in context
+    "max_messages": process.env.MAX_MESSAGES || 15, // max number of messages to keep in context
     "num_examples": 2, // number of examples to give to the model
     "max_commands": -1, // max number of commands that can be used in consecutive responses. -1 for no limit
     "verbose_commands": true, // show full command syntax

File diff suppressed because it is too large


@@ -0,0 +1,16 @@
{
"num_tasks": 199,
"avg_depth": 2.2525252525252526,
"std_depth": 0.9248907589704736,
"num_tasks_based_depth": {
"0": 98,
"1": 72,
"2": 20,
"3": 8
},
"num_missing_resources": {
"0": 116,
"1": 76,
"2": 6
}
}
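
The remaining analysis files in this commit share this same schema; a minimal sketch for comparing them side by side, assuming a hypothetical stats-file naming pattern (the rendered diff does not show these filenames):

import glob
import json

# Load every stats file (the glob pattern is an assumption) and print
# task counts next to depth statistics for quick comparison.
for path in sorted(glob.glob("tasks/**/*stats*.json", recursive=True)):
    with open(path) as f:
        stats = json.load(f)
    print(f"{path}: {stats['num_tasks']} tasks, "
          f"depth {stats['avg_depth']:.2f} +/- {stats['std_depth']:.2f}")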

File diff suppressed because it is too large


@@ -0,0 +1,16 @@
{
"num_tasks": 184,
"avg_depth": 2.4754098360655736,
"std_depth": 0.9684626549338964,
"num_tasks_based_depth": {
"0": 72,
"1": 78,
"2": 21,
"3": 12
},
"num_missing_resources": {
"0": 108,
"1": 66,
"2": 9
}
}

File diff suppressed because it is too large


@@ -0,0 +1,15 @@
{
"num_tasks": 129,
"avg_depth": 2.5625,
"std_depth": 1.0879309490955758,
"num_tasks_based_depth": {
"0": 56,
"1": 52,
"2": 12,
"3": 8
},
"num_missing_resources": {
"0": 100,
"1": 28
}
}

File diff suppressed because it is too large


@@ -0,0 +1,15 @@
{
"num_tasks": 131,
"avg_depth": 2.8461538461538463,
"std_depth": 0.9880948137434719,
"num_tasks_based_depth": {
"0": 45,
"1": 60,
"2": 15,
"3": 10
},
"num_missing_resources": {
"0": 95,
"1": 35
}
}

File diff suppressed because it is too large


@@ -0,0 +1,14 @@
{
"num_tasks": 205,
"avg_depth": 2.1176470588235294,
"std_depth": 0.7578881603955948,
"num_tasks_based_depth": {
"0": 100,
"1": 76,
"2": 28
},
"num_missing_resources": {
"0": 145,
"1": 59
}
}

File diff suppressed because it is too large


@@ -0,0 +1,14 @@
{
"num_tasks": 172,
"avg_depth": 2.280701754385965,
"std_depth": 0.6947013990604671,
"num_tasks_based_depth": {
"0": 63,
"1": 81,
"2": 27
},
"num_missing_resources": {
"0": 135,
"1": 36
}
}

File diff suppressed because it is too large


@@ -0,0 +1,14 @@
{
"num_tasks": 153,
"avg_depth": 2.1578947368421053,
"std_depth": 0.7443229275647865,
"num_tasks_based_depth": {
"0": 72,
"1": 60,
"2": 20
},
"num_missing_resources": {
"0": 128,
"1": 24
}
}

File diff suppressed because it is too large


@@ -0,0 +1,14 @@
{
"num_tasks": 136,
"avg_depth": 2.259259259259259,
"std_depth": 0.6436350813697311,
"num_tasks_based_depth": {
"0": 50,
"1": 65,
"2": 20
},
"num_missing_resources": {
"0": 110,
"1": 25
}
}

File diff suppressed because it is too large


@@ -0,0 +1,16 @@
{
"num_tasks": 276,
"avg_depth": 2.260869565217391,
"std_depth": 0.7642780796194337,
"num_tasks_based_depth": {
"0": 100,
"1": 100,
"2": 76
},
"num_missing_resources": {
"0": 148,
"1": 91,
"2": 35,
"3": 2
}
}

File diff suppressed because it is too large


@@ -0,0 +1,15 @@
{
"num_tasks": 281,
"avg_depth": 2.2562277580071175,
"std_depth": 0.7487869226953681,
"num_tasks_based_depth": {
"0": 100,
"1": 100,
"2": 81
},
"num_missing_resources": {
"0": 171,
"1": 79,
"2": 31
}
}

File diff suppressed because it is too large


@@ -0,0 +1,16 @@
{
"num_tasks": 248,
"avg_depth": 2.0766129032258065,
"std_depth": 0.8020748166140171,
"num_tasks_based_depth": {
"0": 100,
"1": 100,
"2": 48
},
"num_missing_resources": {
"0": 155,
"1": 66,
"2": 25,
"3": 2
}
}

File diff suppressed because it is too large


@@ -0,0 +1,15 @@
{
"num_tasks": 240,
"avg_depth": 2.0166666666666666,
"std_depth": 0.7525881269917101,
"num_tasks_based_depth": {
"0": 100,
"1": 100,
"2": 40
},
"num_missing_resources": {
"0": 173,
"1": 64,
"2": 3
}
}