This commit is contained in:
hlillemark 2025-03-16 18:56:48 -07:00
commit dcfa9c01ba
29 changed files with 81511 additions and 15 deletions

View file

@ -0,0 +1,39 @@
import boto3
import os
import json
import re
from botocore.exceptions import ClientError
import json
import argparse
from tqdm import tqdm
import glob
def analyze_json_file(file_path):
    """
    Analyze a single agent JSON log to determine whether the task succeeded.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        bool or None: True if a system turn contains a success marker,
        False if no marker is found (or the log has no usable 'turns' list),
        or None if the file is missing, is not valid JSON, or another error
        occurs while reading it.
    """
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        if 'turns' in data and isinstance(data['turns'], list):
            # Scan turns from the end: success markers are emitted at the
            # end of a run, so this usually short-circuits quickly.
            for turn in reversed(data['turns']):
                content = turn.get('content')
                if turn.get('role') == 'system' and isinstance(content, str):
                    if ("Task successful ended with code : 2" in content
                            or "Task ended in score: 1" in content):
                        return True
        return False
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {file_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while processing {file_path}: {e}")
        return None

222
analyze_cooking_tasks.py Normal file
View file

@ -0,0 +1,222 @@
import os
import json
import re
from collections import defaultdict
def extract_cooking_items(exp_dir):
    """Parse the cooking item names out of an experiment directory name."""
    # Strip the standard prefix and any trailing blocked-access suffix.
    stripped = re.sub(r'^multiagent_cooking_', '', exp_dir)
    stripped = re.sub(r'_blocked_access_[0-9_]+$', '', stripped)

    # Items appear as "<count>_<name>". The name pattern can absorb the
    # separator underscore before the next count, so trim trailing "_".
    return [m.group(2).rstrip('_')
            for m in re.finditer(r'([0-9]+)_([a-zA-Z_]+)', stripped)]
def analyze_experiments(root_dir):
    """
    Walk experiment directories under root_dir and aggregate success stats.

    Args:
        root_dir (str): Directory containing "multiagent_cooking_*"
            experiment subdirectories, each holding per-agent JSON logs.

    Returns:
        tuple: (blocked_access_results, cooking_item_results, all_cooking_items)
            - blocked_access_results: defaultdict keyed by "<N> agent(s)"
              with success/total counters plus cake / non-cake sub-counters.
            - cooking_item_results: defaultdict keyed by cooking item name
              with success/total counters.
            - all_cooking_items: set of every cooking item seen.
    """
    # Store results by number of blocked agents
    blocked_access_results = defaultdict(lambda: {
        "success": 0,
        "total": 0,
        "cake_success": 0,
        "cake_total": 0,
        "non_cake_success": 0,
        "non_cake_total": 0
    })

    # Store results by cooking item
    cooking_item_results = defaultdict(lambda: {
        "success": 0,
        "total": 0
    })

    # Keep track of all unique cooking items
    all_cooking_items = set()

    # Get a list of all experiment directories
    experiment_dirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))
                       and d.startswith("multiagent_cooking_")]

    for exp_dir in experiment_dirs:
        # Extract cooking items
        cooking_items = extract_cooking_items(exp_dir)

        # Add to unique items set
        all_cooking_items.update(cooking_items)

        # Check if experiment involves cake
        has_cake = any(item == "cake" for item in cooking_items)

        # Extract blocked access information from directory name
        blocked_access_match = re.search(r'blocked_access_([0-9_]+)$', exp_dir)

        if blocked_access_match:
            blocked_access_str = blocked_access_match.group(1)
            # Count how many agents have blocked access
            num_blocked_agents = len(blocked_access_str.split('_'))
            blocked_key = f"{num_blocked_agents} agent(s)"
        else:
            # No agents blocked
            blocked_key = "0 agent(s)"

        # An experiment counts as successful if ANY agent log contains the
        # success marker string in a system turn.
        is_successful = False
        full_exp_path = os.path.join(root_dir, exp_dir)

        # Get all JSON files in the experiment directory
        agent_files = [f for f in os.listdir(full_exp_path) if f.endswith(".json")]

        # Check each agent file for success information
        for agent_file in agent_files:
            agent_file_path = os.path.join(full_exp_path, agent_file)

            try:
                with open(agent_file_path, 'r') as f:
                    agent_data = json.load(f)

                # Check for success in the turns data
                if "turns" in agent_data:
                    for turn in agent_data["turns"]:
                        if turn.get("role") == "system" and "content" in turn:
                            if isinstance(turn["content"], str) and "Task ended with score : 1" in turn["content"]:
                                is_successful = True
                                break

                # If we found success, no need to check other files
                if is_successful:
                    break

            except (json.JSONDecodeError, IOError) as e:
                print(f"Error reading {agent_file_path}: {e}")
                # Continue to check other agent files instead of failing
                continue

        # Update cooking item results
        for item in cooking_items:
            cooking_item_results[item]["total"] += 1
            if is_successful:
                cooking_item_results[item]["success"] += 1

        # Update the appropriate blocked access counters
        # First update the category-specific counters
        if has_cake:
            blocked_access_results[blocked_key]["cake_total"] += 1
            if is_successful:
                blocked_access_results[blocked_key]["cake_success"] += 1
        else:
            blocked_access_results[blocked_key]["non_cake_total"] += 1
            if is_successful:
                blocked_access_results[blocked_key]["non_cake_success"] += 1

        # NOTE(review): every experiment — cake or not — is added to the main
        # totals below, even though print_blocked_results labels the main
        # table as "Excluding Cake Experiments" and warns when total differs
        # from non_cake_total. Confirm which behavior is intended.
        blocked_access_results[blocked_key]["total"] += 1
        if is_successful:
            blocked_access_results[blocked_key]["success"] += 1

    return blocked_access_results, cooking_item_results, all_cooking_items
def print_blocked_results(results):
    """Print success tables keyed by number of blocked agents: a main table
    plus a cake-experiment breakdown."""
    print("\nExperiment Results by Number of Agents with Blocked Access (Excluding Cake Experiments):")
    print("=" * 80)
    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15} | {'Cake Tasks':<15} | {'Non-Cake Tasks':<15}")
    print("-" * 80)

    # Running totals across all blocked-agent buckets.
    totals = {"success": 0, "total": 0, "cake": 0, "non_cake": 0}

    # Sort buckets numerically by the leading agent count in the key.
    ordered_keys = sorted(results.keys(), key=lambda k: int(k.split()[0]))

    for key in ordered_keys:
        row = results[key]
        succeeded = row["success"]
        attempted = row["total"]
        cake_count = row["cake_total"]
        non_cake_count = row["non_cake_total"]

        # Sanity check: the main totals are expected to equal the non-cake count.
        if non_cake_count != attempted:
            print(f"Warning: Non-cake total ({non_cake_count}) doesn't match the total ({attempted}) for {key}")

        totals["success"] += succeeded
        totals["total"] += attempted
        totals["cake"] += cake_count
        totals["non_cake"] += non_cake_count

        rate = (succeeded / attempted * 100) if attempted > 0 else 0
        print(f"{key:<15} | {rate:>6.2f}% | {succeeded}/{attempted:<13} | {cake_count:<15} | {non_cake_count:<15}")

    overall_rate = (totals["success"] / totals["total"] * 100) if totals["total"] > 0 else 0
    print("-" * 80)
    print(f"{'Overall':<15} | {overall_rate:>6.2f}% | {totals['success']}/{totals['total']:<13} | {totals['cake']:<15} | {totals['non_cake']:<15}")

    # Cake-only breakdown.
    print("\nCake Experiment Details:")
    print("=" * 60)
    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15}")
    print("-" * 60)

    cake_succeeded = 0
    cake_attempted = 0
    for key in ordered_keys:
        row_success = results[key]["cake_success"]
        row_total = results[key]["cake_total"]
        cake_succeeded += row_success
        cake_attempted += row_total
        row_rate = (row_success / row_total * 100) if row_total > 0 else 0
        print(f"{key:<15} | {row_rate:>6.2f}% | {row_success}/{row_total}")

    cake_rate = (cake_succeeded / cake_attempted * 100) if cake_attempted > 0 else 0
    print("-" * 60)
    print(f"{'Overall':<15} | {cake_rate:>6.2f}% | {cake_succeeded}/{cake_attempted}")
def print_cooking_items(cooking_items):
    """List every distinct cooking item seen across the experiments."""
    print("\nUnique Cooking Items Found:")
    print("=" * 60)
    sorted_names = sorted(cooking_items)
    print(", ".join(sorted_names))
    print(f"Total unique items: {len(cooking_items)}")
def print_item_results(item_results):
    """Print a success-rate table keyed by cooking item, sorted by name."""
    print("\nExperiment Results by Cooking Item:")
    print("=" * 60)
    print(f"{'Cooking Item':<20} | {'Success Rate':<15} | {'Success/Total':<15}")
    print("-" * 60)

    # Keys are unique, so sorting the items pairs never compares the dicts.
    for name in sorted(item_results):
        stats = item_results[name]
        wins = stats["success"]
        attempts = stats["total"]
        pct = (wins / attempts * 100) if attempts > 0 else 0
        print(f"{name:<20} | {pct:>6.2f}% | {wins}/{attempts}")

    print("-" * 60)
def main():
    """Entry point: analyze the experiments directory and print all reports."""
    # Update this path to your experiments directory
    experiments_root = "../results/llama_70b_hells_kitchen_cooking_tasks"
    print(f"Analyzing experiments in: {os.path.abspath(experiments_root)}")

    blocked, per_item, items_seen = analyze_experiments(experiments_root)

    print_blocked_results(blocked)
    print_cooking_items(items_seen)
    print_item_results(per_item)
if __name__ == "__main__":
main()

View file

@ -8,6 +8,8 @@ import re
import sys
import os
import time
import filecmp
import json
BLOCKED_ACTIONS_COOKING = [
'!activate', '!attackPlayer', '!checkBlueprint', '!checkBlueprintLevel',
@ -225,6 +227,8 @@ def launch_server_experiment(task_path,
subprocess.run(['tmux', 'new-session', '-d', '-s', session_name], check=True)
# set environment variables
set_environment_variable_tmux_session(session_name, "MINECRAFT_PORT", server_port)
set_environment_variable_tmux_session(session_name, "MINDSERVER_PORT", mindserver_port)
@ -233,13 +237,14 @@ def launch_server_experiment(task_path,
set_environment_variable_tmux_session(session_name, "INSECURE_CODING", "true")
# you need to add the bots to the world first before you can add them as op
cmd = f"node main.js --task_path example_tasks.json --task_id debug_{num_agents}_agent_timeout"
# cmd = f"node main.js --task_path example_tasks.json --task_id debug_{num_agents}_agent_timeout"
subprocess.run(["tmux", "send-keys", "-t", session_name, cmd, "C-m"])
# subprocess.run(["tmux", "send-keys", "-t", session_name, cmd, "C-m"])
time.sleep(40)
# time.sleep(40)
subprocess.run(["tmux", "send-keys", "-t", "server_" + session_name, f"/op {agent_names[0]}", "C-m"])
# subprocess.run(["tmux", "send-keys", "-t", "server_" + session_name, f"/op @a", "C-m"])
make_ops(agent_names, session_name)
# add the bots as op
# op_script_content = "sleep 5\n\op @p" * 20
@ -252,6 +257,11 @@ def launch_server_experiment(task_path,
elif task_type == "construction":
set_environment_variable_tmux_session(session_name, "BLOCKED_ACTIONS", BLOCKED_ACTIONS_CONSTRUCTION)
split_task_path = task_path.split("/")
if len(split_task_path) > 1:
task_path_name = split_task_path[-2]
else:
task_path_name = "tasks"
script_content = ""
for task_id in task_ids:
@ -274,8 +284,7 @@ def launch_server_experiment(task_path,
script_content += f"{cp_cmd}\n"
script_content += "sleep 1\n"
if s3:
s3_cmd = f"aws s3 cp {agent_file_path} s3://{bucket_name}/{exp_name}/{task_id}/{agent}_{_}.json"
s3_upload_experiment = f"aws s3 cp {agent_file_path} s3://{bucket_name}/{exp_name}/{task_id}/{agent}_{_}.json"
s3_cmd = f"aws s3 cp {agent_file_path} s3://{bucket_name}/{task_type}/{model}/{task_path_name}/{exp_name}/{task_id}/{agent}_{_}.json"
script_content += f"echo 'Uploading {agent_file_path} to S3'\n"
script_content += f"echo '{s3_cmd}'\n"
script_content += f"{s3_cmd}\n"
@ -283,12 +292,42 @@ def launch_server_experiment(task_path,
script_content += f"sleep 10\n"
if s3:
for agent in agent_names:
script_content += f"aws s3 cp bots/{agent} s3://{bucket_name}/{exp_name}/bots/{agent} --recursive\n"
script_content += f"aws s3 cp bots/{agent} s3://{bucket_name}/{task_type}/{model}/{task_path_name}/{exp_name}/bots/{agent} --recursive\n"
# Create a temporary shell script file
script_file = f"./tmp/experiment_script_{session_name}.sh"
make_script_file_and_run(script_content, session_name, script_file)
def make_ops(agent_names, session_name, max_retries=5):
    """
    Make the agents operators in the Minecraft world.

    Launches a short dummy task so the bots join the server, issues "/op @a"
    in the server tmux session, then verifies the result against the server's
    ops.json, retrying on failure.

    Args:
        agent_names (list): Names of the agent bots to make operators.
        session_name (str): tmux session suffix; the agent session is
            session_name and the server session is "server_" + session_name.
        max_retries (int): Remaining retry attempts. The original recursed
            unconditionally on failure, which never terminates if the agents
            can never become operators; this bounds the retries while keeping
            the previous call signature compatible.
    """
    print('Making agents operators...')
    cmd = f"node main.js --task_path example_tasks.json --task_id debug_{len(agent_names)}_agent_timeout"
    subprocess.run(["tmux", "send-keys", "-t", session_name, cmd, "C-m"])
    # Give the dummy task time to connect the bots before issuing /op.
    time.sleep(30)
    subprocess.run(["tmux", "send-keys", "-t", "server_" + session_name, f"/op @a", "C-m"])
    agents_op = check_agent_ops(agent_names, ops_file=f"./server_data_{session_name}/ops.json")
    if agents_op:
        print("Agents are operators! You are good to go :D")
    else:
        print("Agents are not operators! Something went wrong :(")
        if max_retries > 0:
            make_ops(agent_names, session_name, max_retries - 1)
        else:
            print("Giving up on making agents operators after repeated retries.")
def check_agent_ops(agent_names, ops_file="ops.json"):
    """Return True iff every agent name appears in the server's ops.json."""
    with open(ops_file, "r") as f:
        ops_entries = json.load(f)

    op_names = {entry["name"] for entry in ops_entries}
    return all(agent in op_names for agent in agent_names)
def make_script_file_and_run(script_content, session_name, file_name):
script_dir = os.path.dirname(file_name)
os.makedirs(script_dir, exist_ok=True)
@ -372,6 +411,23 @@ def copy_server_files(source_path, dest_path):
print(f"Server files copied to {dest_path}")
except Exception as e:
print(f"Error copying server files: {e}")
time.sleep(10)
same_files = check_same_files(source_path, dest_path)
if not same_files:
copy_server_files(source_path, dest_path)
print("The destination path does not contain all the same files as the source path.")
else:
print("The destination path contains all the same files as the source path.")
def check_same_files(d1, d2):
    """Return True iff both directories hold exactly the same entry names.

    Only top-level names are compared — not file contents or subdirectories.
    """
    return set(os.listdir(d1)) == set(os.listdir(d2))
def delete_server_files(dest_path):
"""Delete server files from the specified location."""

View file

@ -39,7 +39,7 @@ export default
"code_timeout_mins": -1, // minutes code is allowed to run. -1 for no timeout
"relevant_docs_count": 5, // Parameter: -1 = all, 0 = no references, 5 = five references. If exceeding the maximum, all reference documents are returned.
"max_messages": 150, // max number of messages to keep in context
"max_messages": process.env.MAX_MESSAGES || 15, // max number of messages to keep in context
"num_examples": 2, // number of examples to give to the model
"max_commands": -1, // max number of commands that can be used in consecutive responses. -1 for no limit
"verbose_commands": true, // show full command syntax

View file

@ -830,7 +830,7 @@ export async function putInChest(bot, itemName, num=-1) {
export async function takeFromChest(bot, itemName, num=-1) {
/**
* Take the given item from the nearest chest.
* Take the given item from the nearest chest, potentially from multiple slots.
* @param {MinecraftBot} bot, reference to the minecraft bot.
* @param {string} itemName, the item or block name to take from the chest.
* @param {number} num, the number of items to take from the chest. Defaults to -1, which takes all items.
@ -845,17 +845,33 @@ export async function takeFromChest(bot, itemName, num=-1) {
}
await goToPosition(bot, chest.position.x, chest.position.y, chest.position.z, 2);
const chestContainer = await bot.openContainer(chest);
let item = chestContainer.containerItems().find(item => item.name === itemName);
if (!item) {
// Find all matching items in the chest
let matchingItems = chestContainer.containerItems().filter(item => item.name === itemName);
if (matchingItems.length === 0) {
log(bot, `Could not find any ${itemName} in the chest.`);
await chestContainer.close();
return false;
}
let to_take = num === -1 ? item.count : Math.min(num, item.count);
await chestContainer.withdraw(item.type, null, to_take);
let totalAvailable = matchingItems.reduce((sum, item) => sum + item.count, 0);
let remaining = num === -1 ? totalAvailable : Math.min(num, totalAvailable);
let totalTaken = 0;
// Take items from each slot until we've taken enough or run out
for (const item of matchingItems) {
if (remaining <= 0) break;
let toTakeFromSlot = Math.min(remaining, item.count);
await chestContainer.withdraw(item.type, null, toTakeFromSlot);
totalTaken += toTakeFromSlot;
remaining -= toTakeFromSlot;
}
await chestContainer.close();
log(bot, `Successfully took ${to_take} ${itemName} from the chest.`);
return true;
log(bot, `Successfully took ${totalTaken} ${itemName} from the chest.`);
return totalTaken > 0;
}
export async function viewChest(bot) {

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,16 @@
{
"num_tasks": 199,
"avg_depth": 2.2525252525252526,
"std_depth": 0.9248907589704736,
"num_tasks_based_depth": {
"0": 98,
"1": 72,
"2": 20,
"3": 8
},
"num_missing_resources": {
"0": 116,
"1": 76,
"2": 6
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,16 @@
{
"num_tasks": 184,
"avg_depth": 2.4754098360655736,
"std_depth": 0.9684626549338964,
"num_tasks_based_depth": {
"0": 72,
"1": 78,
"2": 21,
"3": 12
},
"num_missing_resources": {
"0": 108,
"1": 66,
"2": 9
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,15 @@
{
"num_tasks": 129,
"avg_depth": 2.5625,
"std_depth": 1.0879309490955758,
"num_tasks_based_depth": {
"0": 56,
"1": 52,
"2": 12,
"3": 8
},
"num_missing_resources": {
"0": 100,
"1": 28
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,15 @@
{
"num_tasks": 131,
"avg_depth": 2.8461538461538463,
"std_depth": 0.9880948137434719,
"num_tasks_based_depth": {
"0": 45,
"1": 60,
"2": 15,
"3": 10
},
"num_missing_resources": {
"0": 95,
"1": 35
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,14 @@
{
"num_tasks": 205,
"avg_depth": 2.1176470588235294,
"std_depth": 0.7578881603955948,
"num_tasks_based_depth": {
"0": 100,
"1": 76,
"2": 28
},
"num_missing_resources": {
"0": 145,
"1": 59
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,14 @@
{
"num_tasks": 172,
"avg_depth": 2.280701754385965,
"std_depth": 0.6947013990604671,
"num_tasks_based_depth": {
"0": 63,
"1": 81,
"2": 27
},
"num_missing_resources": {
"0": 135,
"1": 36
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,14 @@
{
"num_tasks": 153,
"avg_depth": 2.1578947368421053,
"std_depth": 0.7443229275647865,
"num_tasks_based_depth": {
"0": 72,
"1": 60,
"2": 20
},
"num_missing_resources": {
"0": 128,
"1": 24
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,14 @@
{
"num_tasks": 136,
"avg_depth": 2.259259259259259,
"std_depth": 0.6436350813697311,
"num_tasks_based_depth": {
"0": 50,
"1": 65,
"2": 20
},
"num_missing_resources": {
"0": 110,
"1": 25
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,16 @@
{
"num_tasks": 276,
"avg_depth": 2.260869565217391,
"std_depth": 0.7642780796194337,
"num_tasks_based_depth": {
"0": 100,
"1": 100,
"2": 76
},
"num_missing_resources": {
"0": 148,
"1": 91,
"2": 35,
"3": 2
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,15 @@
{
"num_tasks": 281,
"avg_depth": 2.2562277580071175,
"std_depth": 0.7487869226953681,
"num_tasks_based_depth": {
"0": 100,
"1": 100,
"2": 81
},
"num_missing_resources": {
"0": 171,
"1": 79,
"2": 31
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,16 @@
{
"num_tasks": 248,
"avg_depth": 2.0766129032258065,
"std_depth": 0.8020748166140171,
"num_tasks_based_depth": {
"0": 100,
"1": 100,
"2": 48
},
"num_missing_resources": {
"0": 155,
"1": 66,
"2": 25,
"3": 2
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,15 @@
{
"num_tasks": 240,
"avg_depth": 2.0166666666666666,
"std_depth": 0.7525881269917101,
"num_tasks_based_depth": {
"0": 100,
"1": 100,
"2": 40
},
"num_missing_resources": {
"0": 173,
"1": 64,
"2": 3
}
}