"""
tasks/evaluation_script.py

Entry point for running Minecraft AI agent experiments: splits task
definitions across parallel server instances, launches them (in tmux
sessions by default), monitors progress, aggregates results, and
optionally uploads them to S3.
"""

import argparse
import json
import logging
import os
import subprocess
import time
from datetime import datetime
from typing import List, Tuple

from tasks.evaluation import (
    aggregate_results,
    check_folder_results,
)
from tasks.experiment_utils import (
    update_keys_json,
    set_environment_variable_tmux_session,
    make_profiles,
    create_server_files,
    edit_file,
    clean_up_server_files,
    launch_world,
    make_ops,
    make_script_file_and_run,
)

# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Task-specific blocked actions constants
BLOCKED_ACTIONS_COOKING = [
'!activate', '!attackPlayer', '!checkBlueprint', '!checkBlueprintLevel',
'!clearChat', '!clearFurnace', '!consume', '!craftable', '!discard',
'!endGoal', '!entities', '!equip', '!followPlayer', '!getBlueprint', '!getBlueprintLevel',
'!goToBed', '!help', '!modes', '!moveAway', '!newAction', '!placeHere', '!putInChest',
'!restart', '!setMode', '!stay', '!stfu', '!stop'
]
BLOCKED_ACTIONS_CRAFTING = [
'!activate', '!attack', '!attackPlayer', '!checkBlueprint', '!checkBlueprintLevel',
'!clearChat', '!clearFurnace', '!consume', '!craftable', '!discard', '!endConversation',
'!endGoal', '!entities', '!followPlayer', '!getBlueprint', '!getBlueprintLevel',
'!goToBed', '!help', '!modes', '!newAction', '!putInChest', '!restart',
'!searchForEntity', '!setMode', '!stay', '!stfu', '!stop', '!takeFromChest',
'!viewChest'
]
BLOCKED_ACTIONS_CONSTRUCTION = [
'!activate', '!attackPlayer', '!clearChat', '!clearFurnace', '!collectBlocks',
'!consume', '!craftable', '!discard', '!endConversation', '!endGoal', '!entities',
'!equip', '!followPlayer', '!getBlueprint', '!getBlueprintLevel', '!goToBed',
'!help', '!modes', '!moveAway', '!newAction', '!placeHere', '!putInChest',
'!restart', '!searchForBlock', '!searchForEntity', '!setMode', '!stay', '!stfu',
'!stop', '!takeFromChest', '!viewChest', '!craftRecipe', '!smeltItem'
]


def launch_parallel_experiments(task_path: str,
num_exp: int,
exp_name: str,
num_agents: int = 2,
model: str = "gpt-4o-mini",
api: str = "openai",
num_parallel: int = 1,
s3: bool = False,
bucket_name: str = "mindcraft-experiments",
template_profile: str = "profiles/tasks/collab_profile.json",
insecure_coding: bool = False,
url: str = "http://127.0.0.1:8000/v1",
max_messages: int = 15,
num_examples: int = 2,
no_pruning: bool = False,
block_conversation: bool = False,
run_in_tmux: bool = True) -> None:
"""
Orchestrates the launch of parallel experiments and monitors their progress.
This function splits tasks among a specified number of parallel servers,
launches them, and then enters a monitoring loop. It periodically checks
the experiment folder, aggregates results, prints progress, and uploads
to S3 if configured.
Args:
task_path (str): Path to the task definition file.
num_exp (int): Number of times to repeat each task.
exp_name (str): A unique name for this experiment run.
num_agents (int): The number of agents to use per task.
model (str): The model name to be used by the agents.
api (str): The API provider for the model.
num_parallel (int): The number of parallel servers/experiments to run.
s3 (bool): If True, upload results to S3.
bucket_name (str): The S3 bucket to use for uploads.
template_profile (str): Path to the agent profile template.
insecure_coding (bool): If True, enables insecure coding mode.
url (str): The URL for the model API (if applicable).
max_messages (int): Maximum number of messages before summarization.
num_examples (int): Number of examples to use in the prompt.
no_pruning (bool): If True, disables action pruning.
block_conversation (bool): If True, blocks conversation actions.
run_in_tmux (bool): If True, runs servers and scripts in tmux sessions.
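
    Example (hypothetical invocation; the task file path is illustrative):
        launch_parallel_experiments(
            "tasks/multiagent_crafting_tasks.json",
            num_exp=1,
            exp_name="crafting_baseline",
            num_parallel=2,
        )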
"""
    with open(task_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    task_ids = list(json_data.keys())
    task_type = json_data[task_ids[0]]["type"]
    # Split the task_ids into num_parallel groups (round-robin)
    task_ids_split = [task_ids[i::num_parallel] for i in range(num_parallel)]
if task_type == "cooking":
world_name = "Superflat"
elif task_type == "techtree":
world_name = "Forest"
elif task_type == "construction":
world_name = "Superflat"
else:
world_name = "Forest" # Default fallback
if run_in_tmux:
servers = create_server_files("./tasks/server_data/", num_parallel, world_name=world_name)
else:
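        # Without tmux, assume server directories already exist; ports count up
        # from the base Minecraft port 55916.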
servers = [(f"./tasks/server_data_{i}/", 55916 + i) for i in range(num_parallel)]
date_time = datetime.now().strftime("%m-%d_%H-%M")
experiments_folder = f"experiments/{exp_name}_{date_time}"
exp_name = f"{exp_name}_{date_time}"
split_task_path = task_path.split("/")
if len(split_task_path) > 1:
task_path_name = split_task_path[-2]
else:
task_path_name = "tasks"
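    # Results on S3 are organized as bucket/task_type/model/task_source/experiment.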
s3_path = f"{bucket_name}/{task_type}/{model}/{task_path_name}/{exp_name}"
# start experiments
os.makedirs(experiments_folder, exist_ok=True)
for i, server in enumerate(servers):
launch_server_experiment(task_path,
task_ids_split[i],
num_exp,
server,
experiments_folder,
exp_name,
s3=s3,
bucket_name=bucket_name,
template_profile=template_profile,
model=model,
api=api,
insecure_coding=insecure_coding,
num_agents=num_agents,
url=url,
task_type=task_type,
s3_path=s3_path,
max_messages=max_messages,
num_examples=num_examples,
no_pruning=no_pruning,
block_conversation=block_conversation,
run_in_tmux=run_in_tmux)
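        # Stagger launches to avoid contention during server startup.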
time.sleep(5)
total_num_tasks = len(task_ids)
total_num_experiments = total_num_tasks * num_exp
total_run = 0
    # The task definitions were already parsed above; reuse them for evaluation.
    task_definitions = json_data
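    # Poll the experiment folder until every task has been run num_exp times.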
while total_run < total_num_experiments:
# Get all subfolders that have been created
try:
evaluated_folders = [f.path for f in os.scandir(experiments_folder) if f.is_dir()]
except FileNotFoundError:
evaluated_folders = []
if not evaluated_folders:
logging.info("No experiment folders found yet. Waiting...")
time.sleep(60)
continue
results_df = aggregate_results(evaluated_folders, task_definitions)
if results_df.empty:
total_run = 0
success_rate = 0.0
status_dist_str = "No results yet."
else:
total_run = len(results_df)
success_rate = results_df['overall_is_successful'].mean()
status_dist = results_df['overall_completion_status'].value_counts(normalize=True).to_dict()
status_dist_str = ", ".join([f"{k}: {v:.2%}" for k, v in status_dist.items()])
logging.info(f"\n--- Progress Update ({datetime.now().strftime('%H:%M:%S')}) ---")
logging.info(f"Total tasks run: {total_run}/{total_num_experiments}")
logging.info(f"Overall Success Rate: {success_rate:.2%}")
logging.info(f"Completion Status: {status_dist_str}")
# Create a summary dictionary for logging
results_summary = {
"total_evaluated": total_run,
"success_rate": success_rate,
"completion_status_distribution": status_dist,
"exp_name": exp_name,
"template_profile": template_profile,
"model": model,
"api": api,
"num_agents": num_agents,
"task_path": task_path,
"task_type": task_type,
"max_messages": max_messages,
"num_examples": num_examples
}
# Save summary and detailed results
with open(f"{experiments_folder}/results.json", "w") as f:
json.dump(results_summary, f, indent=4, default=str)
if not results_df.empty:
results_df.to_csv(f"{experiments_folder}/detailed_results.csv", index=False)
if s3:
cmd_results = f"aws s3 cp {experiments_folder}/results.json s3://{s3_path}/results.json"
logging.info(cmd_results)
subprocess.run(cmd_results.split(), capture_output=True, text=True)
if not results_df.empty:
cmd_csv = f"aws s3 cp {experiments_folder}/detailed_results.csv s3://{s3_path}/detailed_results.csv"
logging.info(cmd_csv)
subprocess.run(cmd_csv.split(), capture_output=True, text=True)
        time.sleep(60)


def launch_server_experiment(task_path: str,
task_ids: List[str],
num_exp: int,
server: Tuple[str, int],
experiments_folder: str,
exp_name: str = "exp",
num_agents: int = 2,
model: str = "gpt-4o",
api: str = "openai",
s3: bool = False,
bucket_name: str = "mindcraft-experiments",
template_profile: str = "profiles/tasks/collab_profile.json",
insecure_coding: bool = False,
url: str = "http://127.0.0.1:8000/v1",
task_type: str = "techtree",
s3_path: str = "",
max_messages: int = 15,
num_examples: int = 2,
no_pruning: bool = False,
block_conversation: bool = False,
run_in_tmux: bool = True) -> None:
"""
Launches and configures a single server instance for running experiments.
This function handles the setup for one of the parallel experiment instances.
It configures server properties, creates agent profiles, sets up tmux sessions
(if enabled), and generates the script that will run the tasks.
Args:
task_path (str): Path to the task definition file.
task_ids (List[str]): The specific task IDs this server will run.
num_exp (int): Number of times to repeat each task.
server (Tuple[str, int]): A tuple containing the server's path and port.
experiments_folder (str): The root folder for storing experiment results.
exp_name (str): The name of the experiment.
num_agents (int): The number of agents to use.
model (str): The model name to use.
api (str): The API provider for the model.
s3 (bool): If True, enable S3 uploads.
bucket_name (str): The name of the S3 bucket.
template_profile (str): Path to the agent profile template.
insecure_coding (bool): If True, enable insecure coding mode.
url (str): The URL for the model API.
task_type (str): The type of task being run.
s3_path (str): The base S3 path for uploads.
max_messages (int): Maximum messages before summarization.
num_examples (int): Number of examples for the prompt.
no_pruning (bool): If True, disable action pruning.
block_conversation (bool): If True, block conversation actions.
run_in_tmux (bool): If True, run in a tmux session.
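
    Example (hypothetical invocation; the server tuple mirrors what
    create_server_files returns):
        launch_server_experiment("tasks/multiagent_crafting_tasks.json",
                                 ["task_0", "task_1"], 1,
                                 ("./tasks/server_data_0/", 55916),
                                 "experiments/exp_06-15_12-00")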
"""
server_path, server_port = server
edit_file(os.path.join(server_path, "server.properties"), {"server-port": server_port})
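    # Derive the mindserver port from the server port's offset above the
    # base Minecraft port 55916, mapped onto the base mindserver port 8080.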
mindserver_port = server_port - 55916 + 8080
# set up server and agents
session_name = str(server_port - 55916)
    # Assign each agent a name from an ordered list of ten human names,
    # cycling through the list if there are more than ten agents.
    human_names = ["Andy", "Jill", "Bob", "Sally", "Mike", "Laura", "John", "Emma", "Tom", "Kate"]
    agent_names = [f"{human_names[i % len(human_names)]}_{session_name}" for i in range(num_agents)]
    models = [model] * num_agents
    apis = [api] * num_agents
make_profiles(agent_names, models, apis, template_profile=template_profile, url=url)
agent_profiles = [f"./{agent}.json" for agent in agent_names]
    # Serialize the profile list as a JSON array, single-quoted for the shell.
    agent_profiles_str = f"'{json.dumps(agent_profiles)}'"
logging.info(agent_profiles_str)
if run_in_tmux:
logging.info("run in tmux is true")
launch_world(server_path, session_name="server_" + session_name, port=server_port)
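        # Create a detached tmux session for the agents, separate from the
        # "server_" session that hosts the Minecraft world.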
subprocess.run(['tmux', 'new-session', '-d', '-s', session_name], check=True)
# set environment variables
if run_in_tmux:
set_environment_variable_tmux_session(session_name, "MINECRAFT_PORT", server_port)
set_environment_variable_tmux_session(session_name, "MINDSERVER_PORT", mindserver_port)
set_environment_variable_tmux_session(session_name, "PROFILES", agent_profiles_str)
set_environment_variable_tmux_session(session_name, "MAX_MESSAGES", str(max_messages))
set_environment_variable_tmux_session(session_name, "NUM_EXAMPLES", str(num_examples))
set_environment_variable_tmux_session(session_name, "LOG_ALL", "true")
if insecure_coding:
set_environment_variable_tmux_session(session_name, "INSECURE_CODING", "true")
make_ops(agent_names, session_name)
else:
agent_profiles_str = "["
for agent in agent_profiles[:-1]:
agent_profiles_str += f"\"{agent}\", "
agent_profiles_str += f"\"{agent_profiles[-1]}\"]"
logging.debug(agent_profiles_str)
os.environ["PROFILES"] = agent_profiles_str
os.environ["MAX_MESSAGES"] = str(max_messages)
os.environ["NUM_EXAMPLES"] = str(num_examples)
os.environ["LOG_ALL"] = "true"
run_script(task_path,
task_ids,
num_exp,
experiments_folder,
agent_names,
server_path,
s3=s3,
s3_path=s3_path,
session_name=session_name,
               run_in_tmux=run_in_tmux)


def run_script(task_path: str,
task_ids: List[str],
num_exp: int,
experiments_folder: str,
agent_names: List[str],
server_path: str,
s3: bool = False,
s3_path: str = "mindcraft-experiments",
session_name: str = "0",
run_in_tmux: bool = True) -> None:
"""
Generates and executes a shell script to run a sequence of tasks.
This function creates a shell script that contains the `node main.js` commands
to run each task, along with commands to copy the resulting log files to the
correct experiment folder and upload them to S3 if enabled.
Args:
task_path (str): Path to the task definition file.
task_ids (List[str]): The list of task IDs to be run.
num_exp (int): The number of times to repeat each task.
experiments_folder (str): The root folder for storing results.
agent_names (List[str]): The names of the agents participating.
server_path (str): The path to the server directory.
s3 (bool): If True, include S3 upload commands in the script.
s3_path (str): The base S3 path for uploads.
session_name (str): The tmux session name to run the script in.
run_in_tmux (bool): If True, execute the script via tmux.
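
    Example of generated script content (illustrative; paths are assumptions):
        node main.js --task_path 'tasks/crafting.json' --task_id task_0
        sleep 2
        echo 'Saving to experiments/exp/task_0/Andy_0_0.json'
        cp bots/Andy_0/memory.json experiments/exp/task_0/Andy_0_0.json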
"""
script_content = ""
for task_id in task_ids:
# Create a separate folder for each task_id
task_folder = os.path.join(experiments_folder, str(task_id))
os.makedirs(task_folder, exist_ok=True)
assert os.path.exists(task_folder), f"Directory {task_folder} was not created"
logging.info(f"Created directory: {task_folder}")
cmd = f"node main.js --task_path \'{task_path}\' --task_id {task_id}"
for _ in range(num_exp):
script_content += f"{cmd}\n"
script_content += "sleep 2\n"
for agent in agent_names:
agent_file_path = os.path.join(task_folder, f"{agent}_{_}.json")
script_content += f"echo 'Saving to {agent_file_path}'\n"
cp_cmd = f"cp bots/{agent}/memory.json {agent_file_path}"
script_content += f"echo '{cp_cmd}'\n"
script_content += f"{cp_cmd}\n"
script_content += "sleep 1\n"
if s3:
s3_cmd = f"aws s3 cp {agent_file_path} s3://{s3_path}/{task_id}/{agent}_{_}.json"
script_content += f"echo 'Uploading {agent_file_path} to S3'\n"
script_content += f"echo '{s3_cmd}'\n"
script_content += f"{s3_cmd}\n"
script_content += "sleep 1\n"
script_content += f"sleep 10\n"
if s3:
for agent in agent_names:
script_content += f"aws s3 cp bots/{agent} s3://{s3_path}/bots/{agent} --recursive\n"
    # Write the generated commands to a temporary shell script and execute it.
    os.makedirs("./tmp", exist_ok=True)  # ensure ./tmp exists before writing the script
    script_file = f"./tmp/experiment_script_{session_name}.sh"
    make_script_file_and_run(script_content, script_file, session_name=session_name, run_in_tmux=run_in_tmux)


def main() -> None:
"""
Main entry point for the evaluation script.
Parses command-line arguments and orchestrates the experiment launch or
results-checking process.
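
    Example (hypothetical CLI usage):
        python tasks/evaluation_script.py --task_path tasks/multiagent_crafting_tasks.json \
            --num_parallel 2 --exp_name crafting_baseline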
"""
parser = argparse.ArgumentParser(description='Run Minecraft AI agent experiments')
parser.add_argument('--no_launch_world', action='store_true', help='Do not launch the Minecraft world')
parser.add_argument('--task_path', default="tasks/multiagent_crafting_tasks.json", help='Path to the task file')
parser.add_argument('--num_agents', default=2, type=int, help='Number of agents to run')
parser.add_argument('--num_exp', default=1, type=int, help='Number of experiments to run')
parser.add_argument('--num_parallel', default=1, type=int, help='Number of parallel servers to run')
parser.add_argument('--exp_name', default="exp", help='Name of the experiment')
parser.add_argument('--s3', action='store_true', help='Whether to upload to s3')
parser.add_argument('--bucket_name', default="mindcraft-experiments", help='Name of the s3 bucket')
parser.add_argument('--add_keys', action='store_true', help='Create the keys.json to match the environment variables')
    parser.add_argument('--template_profile', default="profiles/tasks/crafting_profile.json", help='Path to the template profile for the agents')
parser.add_argument('--model', default="gpt-4o-mini", help='Model to use for the agents')
parser.add_argument('--api', default="openai", help='API to use for the agents')
parser.add_argument('--insecure_coding', action='store_true', help='Enable insecure coding')
    parser.add_argument('--url', default="http://127.0.0.1:8000/v1", help='URL of the model API endpoint')
parser.add_argument('--max_messages', default=15, type=int, help='Maximum number of messages before summarizing')
    parser.add_argument('--num_examples', default=2, type=int, help='Number of examples to include in the prompt')
parser.add_argument('--no-pruning', action='store_true', help='Disable pruning of the actions')
parser.add_argument('--block_conversation', action='store_true', help='Block conversation actions')
parser.add_argument('--check', metavar='FOLDER_PATH', help='Check and evaluate results in the specified folder without running experiments')
parser.add_argument('--usernames', default="", help='Comma-separated list of usernames for the agents')
args = parser.parse_args()
logging.info(args)
# If --check flag is provided, evaluate results in the specified folder and exit
if args.check:
# The check function now also requires the task definition file.
check_folder_results(args.check, args.task_path)
return
    if not args.no_launch_world:
        # Kill any stale tmux server and delete old server files before relaunching.
        try:
            subprocess.run(['tmux', 'kill-server'], check=True)
        except (subprocess.CalledProcessError, FileNotFoundError):
            logging.info("No tmux server to kill")
        clean_up_server_files(args.num_parallel)
if args.add_keys:
update_keys_json()
    # Inject usernames into the task file when tasks require human participants.
    with open(args.task_path, 'r') as f:
        task = json.load(f)
    first_task = task[list(task.keys())[0]]
    if "human_count" in first_task:
        human_count = first_task["human_count"]
        # Drop empty strings so an omitted --usernames matches human_count == 0.
        username_lst = [u for u in args.usernames.replace(" ", "").split(",") if u]
        if len(username_lst) != human_count:
            raise ValueError(f"Number of usernames provided ({len(username_lst)}) does not match human count ({human_count})")
        if human_count > 0:
            for task_id in task.keys():
                task[task_id]["usernames"] = username_lst
            # Write the updated tasks back to the task file.
            with open(args.task_path, 'w') as f:
                json.dump(task, f, indent=4)
launch_parallel_experiments(args.task_path,
num_exp=args.num_exp,
exp_name=args.exp_name,
num_parallel=args.num_parallel,
s3=args.s3,
bucket_name=args.bucket_name,
template_profile=args.template_profile,
model=args.model,
api=args.api,
insecure_coding=args.insecure_coding,
num_agents=args.num_agents,
url=args.url,
max_messages=args.max_messages,
num_examples=args.num_examples,
no_pruning=args.no_pruning,
block_conversation=args.block_conversation,
                                run_in_tmux=not args.no_launch_world)


if __name__ == "__main__":
main()