import argparse
import json
import subprocess
import time
from datetime import datetime
import os
import logging

import pandas as pd

# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

from tasks.evaluation import (
    aggregate_results,
    check_folder_results,
)
from tasks.experiment_utils import (
    update_keys_json,
    set_environment_variable_tmux_session,
    make_profiles,
    create_server_files,
    edit_file,
    clean_up_server_files,
    launch_world,
    make_ops,
    make_script_file_and_run,
)

from typing import List, Dict, Any, Tuple

# Task-specific blocked actions constants
BLOCKED_ACTIONS_COOKING = [
    '!activate', '!attackPlayer', '!checkBlueprint', '!checkBlueprintLevel',
    '!clearChat', '!clearFurnace', '!consume', '!craftable', '!discard',
    '!endGoal', '!entities', '!equip', '!followPlayer', '!getBlueprint', '!getBlueprintLevel',
    '!goToBed', '!help', '!modes', '!moveAway', '!newAction', '!placeHere', '!putInChest',
    '!restart', '!setMode', '!stay', '!stfu', '!stop'
]

BLOCKED_ACTIONS_CRAFTING = [
    '!activate', '!attack', '!attackPlayer', '!checkBlueprint', '!checkBlueprintLevel',
    '!clearChat', '!clearFurnace', '!consume', '!craftable', '!discard', '!endConversation',
    '!endGoal', '!entities', '!followPlayer', '!getBlueprint', '!getBlueprintLevel',
    '!goToBed', '!help', '!modes', '!newAction', '!putInChest', '!restart',
    '!searchForEntity', '!setMode', '!stay', '!stfu', '!stop', '!takeFromChest',
    '!viewChest'
]

BLOCKED_ACTIONS_CONSTRUCTION = [
    '!activate', '!attackPlayer', '!clearChat', '!clearFurnace', '!collectBlocks',
    '!consume', '!craftable', '!discard', '!endConversation', '!endGoal', '!entities',
    '!equip', '!followPlayer', '!getBlueprint', '!getBlueprintLevel', '!goToBed',
    '!help', '!modes', '!moveAway', '!newAction', '!placeHere', '!putInChest',
    '!restart', '!searchForBlock', '!searchForEntity', '!setMode', '!stay', '!stfu',
    '!stop', '!takeFromChest', '!viewChest', '!craftRecipe', '!smeltItem'
]


def launch_parallel_experiments(task_path: str,
                                num_exp: int,
                                exp_name: str,
                                num_agents: int = 2,
                                model: str = "gpt-4o-mini",
                                api: str = "openai",
                                num_parallel: int = 1,
                                s3: bool = False,
                                bucket_name: str = "mindcraft-experiments",
                                template_profile: str = "profiles/tasks/collab_profile.json",
                                insecure_coding: bool = False,
                                url: str = "http://127.0.0.1:8000/v1",
                                max_messages: int = 15,
                                num_examples: int = 2,
                                no_pruning: bool = False,
                                block_conversation: bool = False,
                                run_in_tmux: bool = True) -> None:
    """
    Orchestrates the launch of parallel experiments and monitors their progress.

    This function splits tasks among a specified number of parallel servers,
    launches them, and then enters a monitoring loop. It periodically checks
    the experiment folder, aggregates results, prints progress, and uploads
    to S3 if configured.

    Args:
        task_path (str): Path to the task definition file.
        num_exp (int): Number of times to repeat each task.
        exp_name (str): A unique name for this experiment run.
        num_agents (int): The number of agents to use per task.
        model (str): The model name to be used by the agents.
        api (str): The API provider for the model.
        num_parallel (int): The number of parallel servers/experiments to run.
        s3 (bool): If True, upload results to S3.
        bucket_name (str): The S3 bucket to use for uploads.
        template_profile (str): Path to the agent profile template.
        insecure_coding (bool): If True, enables insecure coding mode.
        url (str): The URL for the model API (if applicable).
        max_messages (int): Maximum number of messages before summarization.
        num_examples (int): Number of examples to use in the prompt.
        no_pruning (bool): If True, disables action pruning.
        block_conversation (bool): If True, blocks conversation actions.
        run_in_tmux (bool): If True, runs servers and scripts in tmux sessions.
    """
    with open(task_path, 'r', encoding='utf-8') as file:
        content = file.read()
    json_data = json.loads(content)

    task_ids = list(json_data.keys())
    task_type = json_data[task_ids[0]]["type"]

    # Split the task ids into num_parallel interleaved groups.
    task_ids_split = [task_ids[i::num_parallel] for i in range(num_parallel)]
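    # Illustrative: with task ids [t0, t1, t2, t3, t4] and num_parallel=2, the
    # split is [[t0, t2, t4], [t1, t3]] -- each server takes every num_parallel-th task.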

    if task_type == "cooking":
        world_name = "Superflat"
    elif task_type == "techtree":
        world_name = "Forest"
    elif task_type == "construction":
        world_name = "Superflat"
    else:
        world_name = "Forest"  # Default fallback

    if run_in_tmux:
        servers = create_server_files("./tasks/server_data/", num_parallel, world_name=world_name)
    else:
        servers = [(f"./tasks/server_data_{i}/", 55916 + i) for i in range(num_parallel)]

    date_time = datetime.now().strftime("%m-%d_%H-%M")
    experiments_folder = f"experiments/{exp_name}_{date_time}"
    exp_name = f"{exp_name}_{date_time}"

    split_task_path = task_path.split("/")
    if len(split_task_path) > 1:
        task_path_name = split_task_path[-2]
    else:
        task_path_name = "tasks"

    s3_path = f"{bucket_name}/{task_type}/{model}/{task_path_name}/{exp_name}"

    # Start experiments.
    os.makedirs(experiments_folder, exist_ok=True)
    for i, server in enumerate(servers):
        launch_server_experiment(task_path,
                                 task_ids_split[i],
                                 num_exp,
                                 server,
                                 experiments_folder,
                                 exp_name,
                                 s3=s3,
                                 bucket_name=bucket_name,
                                 template_profile=template_profile,
                                 model=model,
                                 api=api,
                                 insecure_coding=insecure_coding,
                                 num_agents=num_agents,
                                 url=url,
                                 task_type=task_type,
                                 s3_path=s3_path,
                                 max_messages=max_messages,
                                 num_examples=num_examples,
                                 no_pruning=no_pruning,
                                 block_conversation=block_conversation,
                                 run_in_tmux=run_in_tmux)
        time.sleep(5)

    total_num_tasks = len(task_ids)
    total_num_experiments = total_num_tasks * num_exp
    total_run = 0

    with open(task_path, 'r') as f:
        task_definitions = json.load(f)

    while total_run < total_num_experiments:
        # Get all subfolders that have been created.
        try:
            evaluated_folders = [f.path for f in os.scandir(experiments_folder) if f.is_dir()]
        except FileNotFoundError:
            evaluated_folders = []

        if not evaluated_folders:
            logging.info("No experiment folders found yet. Waiting...")
            time.sleep(60)
            continue

        results_df = aggregate_results(evaluated_folders, task_definitions)

        if results_df.empty:
            total_run = 0
            success_rate = 0.0
            status_dist = {}  # Initialized here so the summary below never hits an unbound name.
            status_dist_str = "No results yet."
        else:
            total_run = len(results_df)
            success_rate = results_df['overall_is_successful'].mean()
            status_dist = results_df['overall_completion_status'].value_counts(normalize=True).to_dict()
            status_dist_str = ", ".join([f"{k}: {v:.2%}" for k, v in status_dist.items()])

        logging.info(f"\n--- Progress Update ({datetime.now().strftime('%H:%M:%S')}) ---")
        logging.info(f"Total tasks run: {total_run}/{total_num_experiments}")
        logging.info(f"Overall Success Rate: {success_rate:.2%}")
        logging.info(f"Completion Status: {status_dist_str}")

        # Create a summary dictionary for logging.
        results_summary = {
            "total_evaluated": total_run,
            "success_rate": success_rate,
            "completion_status_distribution": status_dist,
            "exp_name": exp_name,
            "template_profile": template_profile,
            "model": model,
            "api": api,
            "num_agents": num_agents,
            "task_path": task_path,
            "task_type": task_type,
            "max_messages": max_messages,
            "num_examples": num_examples
        }

        # Save summary and detailed results.
        with open(f"{experiments_folder}/results.json", "w") as f:
            json.dump(results_summary, f, indent=4, default=str)
        if not results_df.empty:
            results_df.to_csv(f"{experiments_folder}/detailed_results.csv", index=False)

        if s3:
            cmd_results = f"aws s3 cp {experiments_folder}/results.json s3://{s3_path}/results.json"
            logging.info(cmd_results)
            subprocess.run(cmd_results.split(), capture_output=True, text=True)
            if not results_df.empty:
                cmd_csv = f"aws s3 cp {experiments_folder}/detailed_results.csv s3://{s3_path}/detailed_results.csv"
                logging.info(cmd_csv)
                subprocess.run(cmd_csv.split(), capture_output=True, text=True)

        time.sleep(60)


def launch_server_experiment(task_path: str,
                             task_ids: List[str],
                             num_exp: int,
                             server: Tuple[str, int],
                             experiments_folder: str,
                             exp_name: str = "exp",
                             num_agents: int = 2,
                             model: str = "gpt-4o",
                             api: str = "openai",
                             s3: bool = False,
                             bucket_name: str = "mindcraft-experiments",
                             template_profile: str = "profiles/tasks/collab_profile.json",
                             insecure_coding: bool = False,
                             url: str = "http://127.0.0.1:8000/v1",
                             task_type: str = "techtree",
                             s3_path: str = "",
                             max_messages: int = 15,
                             num_examples: int = 2,
                             no_pruning: bool = False,
                             block_conversation: bool = False,
                             run_in_tmux: bool = True) -> None:
    """
    Launches and configures a single server instance for running experiments.

    This function handles the setup for one of the parallel experiment instances.
    It configures server properties, creates agent profiles, sets up tmux sessions
    (if enabled), and generates the script that will run the tasks.

    Args:
        task_path (str): Path to the task definition file.
        task_ids (List[str]): The specific task IDs this server will run.
        num_exp (int): Number of times to repeat each task.
        server (Tuple[str, int]): A tuple containing the server's path and port.
        experiments_folder (str): The root folder for storing experiment results.
        exp_name (str): The name of the experiment.
        num_agents (int): The number of agents to use.
        model (str): The model name to use.
        api (str): The API provider for the model.
        s3 (bool): If True, enable S3 uploads.
        bucket_name (str): The name of the S3 bucket.
        template_profile (str): Path to the agent profile template.
        insecure_coding (bool): If True, enable insecure coding mode.
        url (str): The URL for the model API.
        task_type (str): The type of task being run.
        s3_path (str): The base S3 path for uploads.
        max_messages (int): Maximum messages before summarization.
        num_examples (int): Number of examples for the prompt.
        no_pruning (bool): If True, disable action pruning.
        block_conversation (bool): If True, block conversation actions.
        run_in_tmux (bool): If True, run in a tmux session.
    """
    server_path, server_port = server
    edit_file(os.path.join(server_path, "server.properties"), {"server-port": server_port})
    # Map Minecraft server ports (55916, 55917, ...) onto MindServer ports (8080, 8081, ...).
    mindserver_port = server_port - 55916 + 8080

    # Set up server and agents. Session "0" corresponds to port 55916, "1" to 55917, and so on.
    session_name = str(server_port - 55916)

    if num_agents == 1:
        agent_names = [f"Andy_{session_name}"]
        models = [model]
        apis = [api]
    elif num_agents == 2:
        agent_names = [f"Andy_{session_name}", f"Jill_{session_name}"]
        models = [model] * 2
        apis = [api] * 2
    else:
        # Cycle through an ordered list of 10 human names.
        human_names = ["Andy", "Jill", "Bob", "Sally", "Mike", "Laura", "John", "Emma", "Tom", "Kate"]
        agent_names = []
        for i in range(num_agents):
            name = human_names[i % len(human_names)]
            agent_names.append(f"{name}_{session_name}")
        models = [model] * num_agents
        apis = [api] * num_agents

    make_profiles(agent_names, models, apis, template_profile=template_profile, url=url)

    agent_profiles = [f"./{agent}.json" for agent in agent_names]

    if num_agents == 1:
        agent_profiles_str = f"'[\"{agent_profiles[0]}\"]'"
    elif num_agents == 2:
        agent_profiles_str = f"'[\"{agent_profiles[0]}\", \"{agent_profiles[1]}\"]'"
    else:
        agent_profiles_str = "'["
        for agent in agent_profiles[:-1]:
            agent_profiles_str += f"\"{agent}\", "
        agent_profiles_str += f"\"{agent_profiles[-1]}\"]'"
    logging.info(agent_profiles_str)
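    # Illustrative value for two agents on session "0":
    #   '["./Andy_0.json", "./Jill_0.json"]'
    # The JSON array is wrapped in single quotes so it survives shell expansion.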

    if run_in_tmux:
        logging.info("run in tmux is true")
        launch_world(server_path, session_name="server_" + session_name, port=server_port)
        subprocess.run(['tmux', 'new-session', '-d', '-s', session_name], check=True)

    # Set environment variables.
    if run_in_tmux:
        set_environment_variable_tmux_session(session_name, "MINECRAFT_PORT", server_port)
        set_environment_variable_tmux_session(session_name, "MINDSERVER_PORT", mindserver_port)
        set_environment_variable_tmux_session(session_name, "PROFILES", agent_profiles_str)
        set_environment_variable_tmux_session(session_name, "MAX_MESSAGES", str(max_messages))
        set_environment_variable_tmux_session(session_name, "NUM_EXAMPLES", str(num_examples))
        set_environment_variable_tmux_session(session_name, "LOG_ALL", "true")
        if insecure_coding:
            set_environment_variable_tmux_session(session_name, "INSECURE_CODING", "true")
        make_ops(agent_names, session_name)
    else:
        # Without tmux, rebuild the profiles string without the outer shell quotes
        # and export the settings directly into this process's environment.
        agent_profiles_str = "["
        for agent in agent_profiles[:-1]:
            agent_profiles_str += f"\"{agent}\", "
        agent_profiles_str += f"\"{agent_profiles[-1]}\"]"
        logging.debug(agent_profiles_str)
        os.environ["PROFILES"] = agent_profiles_str
        os.environ["MAX_MESSAGES"] = str(max_messages)
        os.environ["NUM_EXAMPLES"] = str(num_examples)
        os.environ["LOG_ALL"] = "true"

    run_script(task_path,
               task_ids,
               num_exp,
               experiments_folder,
               agent_names,
               server_path,
               s3=s3,
               s3_path=s3_path,
               session_name=session_name,
               run_in_tmux=run_in_tmux)


def run_script(task_path: str,
               task_ids: List[str],
               num_exp: int,
               experiments_folder: str,
               agent_names: List[str],
               server_path: str,
               s3: bool = False,
               s3_path: str = "mindcraft-experiments",
               session_name: str = "0",
               run_in_tmux: bool = True) -> None:
    """
    Generates and executes a shell script to run a sequence of tasks.

    This function creates a shell script that contains the `node main.js` commands
    to run each task, along with commands to copy the resulting log files to the
    correct experiment folder and upload them to S3 if enabled.

    Args:
        task_path (str): Path to the task definition file.
        task_ids (List[str]): The list of task IDs to be run.
        num_exp (int): The number of times to repeat each task.
        experiments_folder (str): The root folder for storing results.
        agent_names (List[str]): The names of the agents participating.
        server_path (str): The path to the server directory.
        s3 (bool): If True, include S3 upload commands in the script.
        s3_path (str): The base S3 path for uploads.
        session_name (str): The tmux session name to run the script in.
        run_in_tmux (bool): If True, execute the script via tmux.
    """
    script_content = ""
    for task_id in task_ids:
        # Create a separate folder for each task_id.
        task_folder = os.path.join(experiments_folder, str(task_id))
        os.makedirs(task_folder, exist_ok=True)
        assert os.path.exists(task_folder), f"Directory {task_folder} was not created"
        logging.info(f"Created directory: {task_folder}")

        cmd = f"node main.js --task_path '{task_path}' --task_id {task_id}"
        for exp_index in range(num_exp):
            script_content += f"{cmd}\n"
            script_content += "sleep 2\n"
            for agent in agent_names:
                agent_file_path = os.path.join(task_folder, f"{agent}_{exp_index}.json")
                script_content += f"echo 'Saving to {agent_file_path}'\n"
                cp_cmd = f"cp bots/{agent}/memory.json {agent_file_path}"
                script_content += f"echo '{cp_cmd}'\n"
                script_content += f"{cp_cmd}\n"
                script_content += "sleep 1\n"
                if s3:
                    s3_cmd = f"aws s3 cp {agent_file_path} s3://{s3_path}/{task_id}/{agent}_{exp_index}.json"
                    script_content += f"echo 'Uploading {agent_file_path} to S3'\n"
                    script_content += f"echo '{s3_cmd}'\n"
                    script_content += f"{s3_cmd}\n"
                    script_content += "sleep 1\n"
            script_content += "sleep 10\n"
    if s3:
        for agent in agent_names:
            script_content += f"aws s3 cp bots/{agent} s3://{s3_path}/bots/{agent} --recursive\n"

    # Write the commands to a temporary shell script and execute it.
    script_file = f"./tmp/experiment_script_{session_name}.sh"
    make_script_file_and_run(script_content, script_file, session_name=session_name, run_in_tmux=run_in_tmux)


def main() -> None:
    """
    Main entry point for the evaluation script.

    Parses command-line arguments and orchestrates the experiment launch or
    results-checking process.
    """
    parser = argparse.ArgumentParser(description='Run Minecraft AI agent experiments')
    parser.add_argument('--no_launch_world', action='store_true', help='Do not launch the Minecraft world')
    parser.add_argument('--task_path', default="tasks/multiagent_crafting_tasks.json", help='Path to the task file')
    parser.add_argument('--num_agents', default=2, type=int, help='Number of agents to run')
    parser.add_argument('--num_exp', default=1, type=int, help='Number of experiments to run')
    parser.add_argument('--num_parallel', default=1, type=int, help='Number of parallel servers to run')
    parser.add_argument('--exp_name', default="exp", help='Name of the experiment')
    parser.add_argument('--s3', action='store_true', help='Whether to upload to s3')
    parser.add_argument('--bucket_name', default="mindcraft-experiments", help='Name of the s3 bucket')
    parser.add_argument('--add_keys', action='store_true', help='Create the keys.json to match the environment variables')
    parser.add_argument('--template_profile', default="profiles/tasks/crafting_profile.json", help='Path to the agent profile template')
    parser.add_argument('--model', default="gpt-4o-mini", help='Model to use for the agents')
    parser.add_argument('--api', default="openai", help='API to use for the agents')
    parser.add_argument('--insecure_coding', action='store_true', help='Enable insecure coding')
    parser.add_argument('--url', default="http://127.0.0.1:8000/v1", help='URL for the model API')
    parser.add_argument('--max_messages', default=15, type=int, help='Maximum number of messages before summarizing')
    parser.add_argument('--num_examples', default=2, type=int, help='Number of examples to use in the prompt')
    # argparse exposes this flag as args.no_pruning (dashes become underscores).
    parser.add_argument('--no-pruning', action='store_true', help='Disable pruning of the actions')
    parser.add_argument('--block_conversation', action='store_true', help='Block conversation actions')
    parser.add_argument('--check', metavar='FOLDER_PATH', help='Check and evaluate results in the specified folder without running experiments')
    parser.add_argument('--usernames', default="", help='Comma-separated list of usernames for the agents')

    args = parser.parse_args()
    logging.info(args)

    # If --check is provided, evaluate results in the specified folder and exit.
    if args.check:
        # The check function also requires the task definition file.
        check_folder_results(args.check, args.task_path)
        return
if not args.no_launch_world:
|
|
try:
|
|
subprocess.run(['tmux', 'kill-server'], check=True)
|
|
except subprocess.CalledProcessError:
|
|
logging.info("No tmux session to kill")
|
|
|
|
# delete all server files
|
|
if not args.no_launch_world:
|
|
clean_up_server_files(args.num_parallel)
|
|
if args.add_keys:
|
|
update_keys_json()
|
|
|
|
# change task file to include usernames
|
|
with open(args.task_path, 'r') as f:
|
|
content = f.read()
|
|
task = json.loads(content)
|
|
# check if human count for first task is non zero
|
|
if "human_count" in task[list(task.keys())[0]]:
|
|
# check if human count is non zero
|
|
human_count = task[list(task.keys())[0]]["human_count"]
|
|
username_lst = args.usernames.replace(" ", "").split(",")
|
|
if len(username_lst) != human_count:
|
|
raise ValueError(f"Number of usernames provided ({len(username_lst)}) does not match human count ({human_count})")
|
|
if human_count > 0:
|
|
for task_id in task.keys():
|
|
task[task_id]["usernames"] = username_lst
|
|
# dump to task_path
|
|
with open(args.task_path, 'w') as f:
|
|
json.dump(task, f, indent=4)
|
|
|
|
launch_parallel_experiments(args.task_path,
|
|
num_exp=args.num_exp,
|
|
exp_name=args.exp_name,
|
|
num_parallel=args.num_parallel,
|
|
s3=args.s3,
|
|
bucket_name=args.bucket_name,
|
|
template_profile=args.template_profile,
|
|
model=args.model,
|
|
api=args.api,
|
|
insecure_coding=args.insecure_coding,
|
|
num_agents=args.num_agents,
|
|
url=args.url,
|
|
max_messages=args.max_messages,
|
|
num_examples=args.num_examples,
|
|
no_pruning=args.no_pruning,
|
|
block_conversation=args.block_conversation,
|
|
run_in_tmux=not args.no_launch_world)
|
|
|
|
if __name__ == "__main__":
|
|
main() |
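

# Illustrative invocation (the script filename here is a placeholder; flags come
# from the argparse setup in main()):
#   python evaluation_script.py --task_path tasks/multiagent_crafting_tasks.json \
#       --exp_name crafting_test --num_parallel 2 --model gpt-4o-mini --api openai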