This commit is contained in:
Isadora White 2025-03-23 11:31:08 -05:00
commit db962d722a
5 changed files with 2356 additions and 33 deletions

View file

@ -1,39 +1,89 @@
import argparse
import glob
import json
import os
import re
from collections import defaultdict

import boto3
from botocore.exceptions import ClientError
from prettytable import PrettyTable
from tqdm import tqdm
def extract_success_scores(root_dir):
    """Collect per-task success scores from experiment logs and print summaries.

    Walks every task folder under root_dir, reads its JSON log files, pulls the
    final "Task ended with score: X" system message, and prints a per-task
    score table plus average-score tables grouped by the material and room
    numbers encoded in each folder name.

    Args:
        root_dir (str): Directory whose sub-folders each hold one task's logs.
    """
    task_scores = {}  # task folder name -> success score
    material_groups = defaultdict(list)
    room_groups = defaultdict(list)

    # Folder names look like 'materials_<M>_rooms_<R>...'; capture M and R.
    pattern = re.compile(r"materials_(\d+)_rooms_(\d+)")

    # Iterate through each task folder.
    for task_folder in os.listdir(root_dir):
        task_path = os.path.join(root_dir, task_folder)
        if os.path.isdir(task_path):
            logs_found = False  # Flag to track if logs exist

            for file_name in os.listdir(task_path):
                if file_name.endswith(".json"):
                    logs_found = True  # JSON file exists
                    file_path = os.path.join(task_path, file_name)
                    try:
                        with open(file_path, 'r') as file:
                            data = json.load(file)

                        # Extract the success score from the last matching
                        # system message (scan turns from the end).
                        for turn in reversed(data.get("turns", [])):
                            if turn["role"] == "system" and "Task ended with score" in turn["content"]:
                                score = float(turn["content"].split(":")[-1].strip())
                                task_scores[task_folder] = score  # Store per-task score
                                break  # Stop searching if found

                        # Stop checking other files in the folder once a
                        # score has been recorded.
                        if task_folder in task_scores:
                            break
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")

            # If no logs were found, print a message.
            if not logs_found:
                print(f"No log files found in {task_folder}")

    # Group scores by material and room number.
    for task, score in task_scores.items():
        match = pattern.search(task)
        if match:
            material = int(match.group(1))  # Extract material number
            room = int(match.group(2))      # Extract room number
            material_groups[material].append(score)
            room_groups[room].append(score)
        else:
            print(f"Warning: Task folder '{task}' does not match expected format.")

    def calculate_average(group):
        # Mean score per key; every list is non-empty because keys are only
        # created when a score is appended.
        return {key: sum(values) / len(values) for key, values in group.items()}

    avg_material_scores = calculate_average(material_groups)
    avg_room_scores = calculate_average(room_groups)

    def display_table(title, data):
        # Render a two-column table of key -> average score.
        table = PrettyTable(["Category", "Average Score"])
        for key, value in sorted(data.items()):
            table.add_row([key, round(value, 2)])
        print(f"\n{title}")
        print(table)

    def display_task_scores():
        table = PrettyTable(["Task", "Success Score"])
        for task, score in sorted(task_scores.items()):
            table.add_row([task, round(score, 2)])
        print("\nTask-wise Success Scores")
        print(table)

    # Print all tables.
    display_task_scores()
    display_table("Average Success Score by Material (Grouped by Number)", avg_material_scores)
    display_table("Average Success Score by Room (Grouped by Number)", avg_room_scores)


# Example usage (replace 'root_directory' with actual path)
root_directory = "experiments/exp_03-22_19-29"
extract_success_scores(root_directory)

View file

@ -1,7 +1,7 @@
{ {
"name": "andy", "name": "andy",
"model": "gpt-4o", "model": "gpt-4o-mini",
"conversing": "You are a task-focused Minecraft bot named $NAME. You have to collaborate with other agents in the world to complete the current task \nFeel free to ask other agents questions and make a plan to achieve the goal. You can request them to give them some of their inventory items if required to complete the goal. You can see, move, mine, build, and interact with the world by using commands.\n$SELF_PROMPT Act human-like as if you were a typical Minecraft player, rather than an AI. Be very brief in your responses, don't apologize constantly, don't give instructions or make lists unless asked, and don't refuse requests. Don't pretend to act, use commands immediately when requested. Do NOT say this: 'Sure, I've stopped.', instead say this: 'Sure, I'll stop. !stop'. Do NOT say this: 'On my way! Give me a moment.', instead say this: 'On my way! !goToPlayer(\"playername\", 3)'. Respond only as $NAME, never output '(FROM OTHER BOT)' or pretend to be someone else. If you have nothing to say or do, respond with an just a tab '\t'. Share resources and information with other bots! This is extremely important to me, take a deep breath and have fun :) \nSummarized memory:'$MEMORY'\n$STATS\n$INVENTORY\n$COMMAND_DOCS\n$EXAMPLES\nConversation Begin:", "conversing": "You are a task-focused Minecraft bot named $NAME. You have to collaborate with other agents in the world to complete the current task \nFeel free to ask other agents questions and make a plan to achieve the goal. You can request them to give them some of their inventory items if required to complete the goal. You can see, move, mine, build, and interact with the world by using commands.\n$SELF_PROMPT Act human-like as if you were a typical Minecraft player, rather than an AI. Be very brief in your responses, don't apologize constantly, don't give instructions or make lists unless asked, and don't refuse requests. Don't pretend to act, use commands immediately when requested. 
Do NOT say this: 'Sure, I've stopped.', instead say this: 'Sure, I'll stop. !stop'. Do NOT say this: 'On my way! Give me a moment.', instead say this: 'On my way! !goToPlayer(\"playername\", 3)'. Respond only as $NAME, never output '(FROM OTHER BOT)' or pretend to be someone else. If you have nothing to say or do, respond with an just a tab '\t'. Share resources and information with other bots! This is extremely important to me, take a deep breath and have fun :) \nSummarized memory:'$MEMORY'\n$STATS\n$INVENTORY\n$COMMAND_DOCS\n$EXAMPLES\nConversation Begin:",

View file

@ -0,0 +1,241 @@
import json
import re
import statistics
import random
import os
def extract_difficulty(task_name):
    """Parse (materials, rooms, windows, carpets) out of a task name.

    Returns (0, 0, 0, 0) when the name does not follow the
    'materials_M_rooms_R_window_W_carpet_C_variant_V' naming convention,
    i.e. unmatched names are treated as lowest difficulty.
    """
    found = re.search(r'materials_(\d+)_rooms_(\d+)_window_(\d+)_carpet_(\d+)_variant_\d+', task_name)
    if found is None:
        return (0, 0, 0, 0)
    return tuple(int(part) for part in found.groups())
def calculate_difficulty_score(task_name, task, alpha=1.0, beta=3.0):
    """Compute a scalar difficulty score for a task (higher = harder).

    The score is a weighted sum of the parameters encoded in the task name:
    materials*4 + rooms*10 + windows*2 + carpets*1.

    Args:
        task_name (str): Task name containing the difficulty parameters.
        task (dict): Task details; currently unused by the formula.
        alpha (float): Kept for interface compatibility; unused.
        beta (float): Kept for interface compatibility; unused.

    Returns:
        int: The weighted difficulty score.
    """
    m, r, w, c = extract_difficulty(task_name)
    # NOTE(review): an earlier version also factored in the blueprint level
    # count and the alpha/beta weights; the current formula uses neither, so
    # the unused level-count computation has been removed.
    return m * 4 + r * 10 + w * 2 + c * 1
def process_json(file_path, output_path, alpha=1.0, beta=3.0):
    """Count tasks, score their difficulty, and save the 30 easiest to a file.

    Tasks whose blueprint has fewer than 3 levels are discarded. The rest are
    scored with calculate_difficulty_score, sorted ascending by score, and the
    30 easiest (or all remaining, if fewer) are written to output_path as a
    JSON object mapping task name -> task details.

    Args:
        file_path (str): Input JSON mapping task name -> task details.
        output_path (str): Destination file for the easiest tasks.
        alpha (float): Forwarded to calculate_difficulty_score.
        beta (float): Forwarded to calculate_difficulty_score.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Count total tasks.
    total_tasks = len(data)
    print(f"Total tasks: {total_tasks}")

    def _describe(scores):
        # Summary statistics for a NON-EMPTY list of scores.
        return {
            "mean": statistics.mean(scores),
            "median": statistics.median(scores),
            "min": min(scores),
            "max": max(scores),
        }

    # Compute difficulty scores for tasks with at least 3 levels.
    task_difficulties = []
    filtered_out = 0
    for task_name, task_details in data.items():
        num_levels = len(task_details.get("blueprint", {}).get("levels", []))
        if num_levels < 3:
            filtered_out += 1
            continue
        score = calculate_difficulty_score(task_name, task_details, alpha, beta)
        task_difficulties.append((task_name, score, task_details))

    print(f"Filtered out {filtered_out} tasks with fewer than 3 levels")
    print(f"Remaining tasks after filtering: {len(task_difficulties)}")

    # Calculate statistics on the filtered tasks.
    if task_difficulties:
        stats = _describe([score for _, score, _ in task_difficulties])
        print(f"Difficulty Statistics for Overall Tasks: {stats}")
    else:
        stats = {"mean": 0, "median": 0, "min": 0, "max": 0}
        print("No tasks remaining after filtering!")

    # Sort tasks by difficulty (ascending) and take the easiest 30 (or all).
    task_difficulties.sort(key=lambda x: x[1])
    num_tasks_to_select = min(30, len(task_difficulties))
    selected = task_difficulties[:num_tasks_to_select]
    easiest_tasks = {name: details for name, _, details in selected}

    # BUG FIX: the original called statistics.mean on the selection without
    # checking for emptiness, raising StatisticsError when every task was
    # filtered out; guard the same way as the overall stats above.
    if selected:
        easiest_stats = _describe([score for _, score, _ in selected])
    else:
        easiest_stats = {"mean": 0, "median": 0, "min": 0, "max": 0}
    print(f"Difficulty Statistics for Easiest Tasks: {easiest_stats}")

    # Count how many of the selected tasks fall in each (m, r, w, c) bucket.
    unique_difficulties = {}
    for task_name, _, _ in selected:
        key = extract_difficulty(task_name)
        unique_difficulties[key] = unique_difficulties.get(key, 0) + 1
    print(f"Unique (m, r, w, c) combinations in the easiest tasks:")
    for difficulty, count in unique_difficulties.items():
        print(f" {difficulty}: {count} tasks")

    output_data = easiest_tasks

    # Save to output file.
    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=4)
    print(f"Saved {num_tasks_to_select} easiest tasks with statistics to {output_path}")
def sample_tasks_with_distribution(file_path, output_path):
    """
    Sample tasks with a specific distribution:
    - 3 tasks for each of the 9 possibilities of (m,r) where 0 <= m <= 2 and 0 <= r <= 2
    - Random (w,c) between 0 and 1 for the above tasks
    - 2 additional tasks from (m,r,w,c) = (0,0,0,0)
    - 1 additional task from (m,r,w,c) = (1,0,0,0)

    Reads the task-definition JSON at file_path, keeps only tasks whose
    blueprint has at least 3 levels, samples according to the distribution
    above, prints a summary, and writes the sampled tasks to output_path.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)
    # Filter tasks with at least 3 levels
    valid_tasks = {}
    for task_name, task_details in data.items():
        num_levels = len(task_details.get("blueprint", {}).get("levels", []))
        if num_levels >= 3:
            valid_tasks[task_name] = task_details
    # print(f"Total valid tasks: {len(valid_tasks)}")
    # Categorize tasks by their (m,r,w,c) values
    tasks_by_params = {}
    for task_name, task_details in valid_tasks.items():
        m, r, w, c = extract_difficulty(task_name)
        key = (m, r, w, c)
        if key not in tasks_by_params:
            tasks_by_params[key] = []
        tasks_by_params[key].append((task_name, task_details))
    # # Print available combinations
    # print("Available (m,r,w,c) combinations:")
    # for params, tasks in tasks_by_params.items():
    #     print(f" {params}: {len(tasks)} tasks")
    # Sample tasks according to the distribution
    sampled_tasks = {}
    already_sampled = set()  # guards against selecting the same task twice
    # 1. Sample 3 tasks for each (m,r) where 0 <= m <= 2 and 0 <= r <= 2
    for m in range(3):
        for r in range(3):
            # Find all tasks with the current (m,r) and w,c between 0 and 1
            candidates = []
            for params, tasks in tasks_by_params.items():
                if params[0] == m and params[1] == r and params[2] <= 1 and params[3] <= 1:
                    candidates.extend(tasks)
            # Sample 3 tasks if possible
            if len(candidates) >= 3:
                sampled = random.sample(candidates, 3)
                for task_name, task_details in sampled:
                    if task_name not in already_sampled:
                        sampled_tasks[task_name] = task_details
                        already_sampled.add(task_name)
            else:
                print(f"Warning: Not enough tasks for (m={m}, r={r}) with w,c <= 1. Found {len(candidates)}.")
                # Add all available
                for task_name, task_details in candidates:
                    if task_name not in already_sampled:
                        sampled_tasks[task_name] = task_details
                        already_sampled.add(task_name)
    # 2. Add 2 tasks with (m,r,w,c) = (0,0,0,0)
    zero_zero_zero_zero = tasks_by_params.get((0,0,0,0), [])
    zero_zero_zero_zero = [t for t in zero_zero_zero_zero if t[0] not in already_sampled]
    if len(zero_zero_zero_zero) >= 2:
        additional = random.sample(zero_zero_zero_zero, 2)
        for task_name, task_details in additional:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)
    else:
        # Fewer than 2 remain: take whatever is left.
        print(f"Warning: Not enough tasks for (0,0,0,0). Found {len(zero_zero_zero_zero)}.")
        for task_name, task_details in zero_zero_zero_zero:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)
    # 3. Add 1 task with (m,r,w,c) = (1,0,0,0)
    one_zero_zero_zero = tasks_by_params.get((1,0,0,0), [])
    one_zero_zero_zero = [t for t in one_zero_zero_zero if t[0] not in already_sampled]
    if len(one_zero_zero_zero) >= 1:
        additional = random.sample(one_zero_zero_zero, 1)
        for task_name, task_details in additional:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)
    else:
        print(f"Warning: Not enough tasks for (1,0,0,0). Found {len(one_zero_zero_zero)}.")
        for task_name, task_details in one_zero_zero_zero:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)
    # Print summary of sampled tasks
    print(f"\nTotal sampled tasks: {len(sampled_tasks)}")
    # Count tasks by their (m,r) values
    distribution = {}
    for task_name in sampled_tasks:
        m, r, w, c = extract_difficulty(task_name)
        key = (m, r)
        if key not in distribution:
            distribution[key] = []
        distribution[key].append((w, c))
    print("\nDistribution of sampled tasks:")
    for mr, wc_list in distribution.items():
        print(f" (m={mr[0]}, r={mr[1]}): {len(wc_list)} tasks")
        for wc in wc_list:
            print(f" (w={wc[0]}, c={wc[1]})")
    # Check for duplicates in sampled tasks
    # NOTE(review): dict keys are unique by construction, so this branch can
    # never fire; it is kept as a sanity check.
    if len(sampled_tasks) != len(set(sampled_tasks.keys())):
        print("\nWARNING: Duplicate tasks detected!")
        # Find the duplicates
        task_counts = {}
        for task_name in sampled_tasks.keys():
            task_counts[task_name] = task_counts.get(task_name, 0) + 1
        duplicates = [task for task, count in task_counts.items() if count > 1]
        print(f"Duplicate tasks: {duplicates}")
    else:
        print("\nVerification: No duplicates found in the sampled tasks.")
    # Save to output file
    with open(output_path, 'w') as f:
        json.dump(sampled_tasks, f, indent=4)
    print(f"\nSaved {len(sampled_tasks)} distributed tasks to {output_path}")
# Example usage:
# process_json('test/2agents.json', 'test/2_agents_easiest_tasks.json', alpha=1.0, beta=3.0)

# Build a distributed sample for every '*agents.json' file in the tasks folder.
tasks_dir = 'test'
for filename in os.listdir(tasks_dir):
    if filename.endswith('agents.json'):
        input_path = os.path.join(tasks_dir, filename)
        # Create output filename by replacing .json with _distributed_tasks.json
        output_filename = filename.replace('.json', '_distributed_tasks.json')
        output_path = os.path.join(tasks_dir, output_filename)
        # BUG FIX: the f-string printed a literal placeholder rather than the
        # file currently being processed.
        print(f"\nProcessing {filename}...")
        sample_tasks_with_distribution(input_path, output_path)

View file

@ -0,0 +1,87 @@
import json
import re
import random
import os
from collections import defaultdict
def extract_difficulty(task_name):
    """Return the (materials, rooms, windows, carpets) tuple for a task name.

    Names are expected to follow the pattern
    'materials_M_rooms_R_window_W_carpet_C_variant_V'; anything else maps to
    the default tuple (0, 0, 0, 0).
    """
    m = re.search(r'materials_(\d+)_rooms_(\d+)_window_(\d+)_carpet_(\d+)_variant_\d+', task_name)
    return tuple(map(int, m.groups())) if m else (0, 0, 0, 0)
def filter_and_sample_tasks(file_path, output_path):
    """Filter, sample, and save up to 500 tasks from a task-definition file.

    Keeps only tasks whose blueprint has at least 3 levels, walks the
    (m, r, w, c) difficulty buckets in increasing order taking tasks until
    500 are collected, shuffles the selection, and writes it to output_path
    as a JSON object mapping task name -> task details.

    Args:
        file_path (str): Input JSON mapping task name -> task details.
        output_path (str): Destination JSON file for the sampled tasks.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)

    total_tasks = len(data)
    print(f"\nProcessing file: {file_path}")
    print(f"Total available tasks: {total_tasks}")

    # Keep only tasks with at least 3 blueprint levels.
    valid_tasks = {}
    for task_name, task_details in data.items():
        num_levels = len(task_details.get("blueprint", {}).get("levels", []))
        if num_levels >= 3:
            valid_tasks[task_name] = task_details
    print(f"Tasks with at least 3 levels: {len(valid_tasks)}")

    # Bucket tasks by their (m, r, w, c) difficulty tuple.
    tasks_by_params = defaultdict(list)
    for task_name, task_details in valid_tasks.items():
        tasks_by_params[extract_difficulty(task_name)].append((task_name, task_details))

    sampled_tasks = {}
    sampled_task_counts = defaultdict(int)

    # Walk buckets easiest-first until 500 tasks are collected.
    # (The original also re-tested `key in tasks_by_params` while iterating
    # that dict's own sorted keys — always true, so dropped; the separate
    # total_selected counter duplicated len(sampled_tasks) and is gone too.)
    for key in sorted(tasks_by_params.keys()):
        if len(sampled_tasks) >= 500:
            break
        for task_name, task_details in tasks_by_params[key]:
            if len(sampled_tasks) >= 500:
                break
            sampled_tasks[task_name] = task_details
            sampled_task_counts[key] += 1  # Keep the key as a tuple

    print(f"\nTotal sampled tasks: {len(sampled_tasks)}")

    # Print task count per (m, r, w, c) tuple.
    print("\nTask count per (m, r, w, c):")
    for key, count in sorted(sampled_task_counts.items()):
        print(f"{key}: {count}")

    # Shuffle so downstream consumers do not see difficulty-ordered tasks.
    shuffled_tasks = list(sampled_tasks.items())
    random.shuffle(shuffled_tasks)
    final_tasks = dict(shuffled_tasks)

    # Save sampled tasks to JSON.
    with open(output_path, 'w') as f:
        json.dump(final_tasks, f, indent=4)
    print(f"\nSaved {len(final_tasks)} tasks to {output_path}")
# Process all relevant files: every '*agents.json' task file in the training
# directory gets a sampled companion file written alongside it.
tasks_dir = 'train'
all_filenames = [f for f in os.listdir(tasks_dir) if f.endswith('agents.json')]
all_filenames.sort()
# NOTE(review): the original used enumerate() here but never used the index.
for filename in all_filenames:
    input_path = os.path.join(tasks_dir, filename)
    output_filename = filename.replace('.json', '_sampled_tasks_for_training.json')
    output_path = os.path.join(tasks_dir, output_filename)
    filter_and_sample_tasks(input_path, output_path)

File diff suppressed because it is too large Load diff