Mirror of https://github.com/kolbytn/mindcraft.git (synced 2025-06-09 18:55:56 +02:00)
Updated analyze scripts to perform model comparison
parent 4c5320eddb, commit 76de807a46
2 changed files with 415 additions and 173 deletions
@@ -4,86 +4,170 @@ from collections import defaultdict
 from prettytable import PrettyTable
 import re

-def extract_success_scores(root_dir):
-    task_scores = {}  # Stores task-wise scores
-    material_groups = defaultdict(list)
-    room_groups = defaultdict(list)
+def extract_success_scores(folders, model_names):
+    assert len(folders) == len(model_names), "Folders and model names lists must have the same length."
+    all_task_scores = defaultdict(dict)  # Stores task-wise scores per model
+    zero_score_tasks = defaultdict(list)  # Stores tasks with 0 score per model
+    null_score_tasks = defaultdict(list)  # Stores tasks with null score per model
+    material_groups = defaultdict(lambda: defaultdict(list))
+    room_groups = defaultdict(lambda: defaultdict(list))
+    material_room_groups = defaultdict(lambda: defaultdict(list))
+    overall_scores = defaultdict(list)  # New dict to store all scores for each model

-    # Regex pattern to extract material and room numbers
     pattern = re.compile(r"materials_(\d+)_rooms_(\d+)")

-    # Iterate through each task folder
-    for task_folder in os.listdir(root_dir):
-        task_path = os.path.join(root_dir, task_folder)
-        if os.path.isdir(task_path):
-            logs_found = False  # Flag to track if logs exist
-            # Check for JSON files
-            for file_name in os.listdir(task_path):
-                if file_name.endswith(".json"):
-                    logs_found = True  # JSON file exists
-                    file_path = os.path.join(task_path, file_name)
-
-                    # Read JSON file
-                    try:
-                        with open(file_path, 'r') as file:
-                            data = json.load(file)
-
-                        # Extract success score from the last system message
-                        for turn in reversed(data.get("turns", [])):
-                            if turn["role"] == "system" and "Task ended with score" in turn["content"]:
-                                score = float(turn["content"].split(":")[-1].strip())
-                                task_scores[task_folder] = score  # Store per-task score
-                                break  # Stop searching if found
-
-                        # Stop checking other files in the folder if score is found
-                        if task_folder in task_scores:
-                            break
-                    except Exception as e:
-                        print(f"Error reading {file_path}: {e}")
-
-            # If no logs were found, print a message
-            if not logs_found:
-                print(f"No log files found in {task_folder}")
-
-    # Group scores by material and room
-    for task, score in task_scores.items():
+    for root_dir, model_name in zip(folders, model_names):
+        for task_folder in os.listdir(root_dir):
+            task_path = os.path.join(root_dir, task_folder)
+            if os.path.isdir(task_path):
+                logs_found = False
+                score_found = False
+                for file_name in os.listdir(task_path):
+                    if file_name.endswith(".json"):
+                        logs_found = True
+                        file_path = os.path.join(task_path, file_name)
+
+                        try:
+                            with open(file_path, 'r') as file:
+                                data = json.load(file)
+
+                            for turn in reversed(data.get("turns", [])):
+                                if turn["role"] == "system" and "Task ended with score" in turn["content"]:
+                                    score = float(turn["content"].split(":")[-1].strip())
+                                    all_task_scores[task_folder][model_name] = score
+                                    overall_scores[model_name].append(score)  # Add to overall scores
+                                    score_found = True
+
+                                    if score == 0:
+                                        zero_score_tasks[model_name].append(task_folder)
+                                    break
+
+                            if score_found:
+                                break
+                        except Exception as e:
+                            print(f"Error reading {file_path}: {e}")
+
+                if logs_found and not score_found:
+                    # Score not found but logs exist - mark as null
+                    all_task_scores[task_folder][model_name] = None
+                    null_score_tasks[model_name].append(task_folder)
+
+                if not logs_found:
+                    print(f"No log files found in {task_folder}")
+
+    # Calculate model completion rates (ignore null scores)
+    model_completion_rates = {}
+    for model_name in model_names:
+        valid_tasks = [task for task in all_task_scores.keys() if model_name in all_task_scores[task] and all_task_scores[task][model_name] is not None]
+        total_tasks = len(valid_tasks)
+        completed_tasks = len([task for task in valid_tasks if all_task_scores[task][model_name] > 0])
+        model_completion_rates[model_name] = (completed_tasks / total_tasks) if total_tasks > 0 else 0
+
+    # Process task scores into groups (ignore null and 0 scores)
+    for task, model_scores in all_task_scores.items():
         match = pattern.search(task)
         if match:
-            material = int(match.group(1))  # Extract material number
-            room = int(match.group(2))  # Extract room number
-            material_groups[material].append(score)
-            room_groups[room].append(score)
-        else:
-            print(f"Warning: Task folder '{task}' does not match expected format.")
+            material = int(match.group(1))
+            room = int(match.group(2))
+            for model, score in model_scores.items():
+                if score is not None and score > 0:  # Ignore null and 0 scores
+                    material_groups[material][model].append(score)
+                    room_groups[room][model].append(score)
+                    material_room_groups[(material, room)][model].append(score)

-    # Calculate average scores
     def calculate_average(group):
-        return {key: sum(values) / len(values) for key, values in group.items()}
+        return {key: {model: sum(scores) / len(scores) for model, scores in models.items() if scores}
+                for key, models in group.items() if models}

     avg_material_scores = calculate_average(material_groups)
     avg_room_scores = calculate_average(room_groups)
+    avg_material_room_scores = calculate_average(material_room_groups)

-    # Display results using PrettyTable
-    def display_table(title, data):
-        table = PrettyTable(["Category", "Average Score"])
-        for key, value in sorted(data.items()):
-            table.add_row([key, round(value, 2)])
+    def display_table(title, data, tuple_keys=False):
+        table = PrettyTable(["Category"] + model_names)
+        for key, model_scores in sorted(data.items()):
+            key_display = key if not tuple_keys else f"({key[0]}, {key[1]})"
+            row = [key_display] + [round(model_scores.get(model, 0), 2) for model in model_names]
+            table.add_row(row)
         print(f"\n{title}")
         print(table)

     def display_task_scores():
-        table = PrettyTable(["Task", "Success Score"])
-        for task, score in sorted(task_scores.items()):
-            table.add_row([task, round(score, 2)])
+        table = PrettyTable(["Task"] + model_names)
+        for task in sorted(all_task_scores.keys()):
+            row = [task]
+            for model in model_names:
+                score = all_task_scores[task].get(model)
+                if score is None:
+                    row.append("null")
+                else:
+                    row.append(round(score, 2))
+            table.add_row(row)
         print("\nTask-wise Success Scores")
         print(table)

-    # Print all tables
+    def display_zero_and_null_score_tasks():
+        for model in model_names:
+            if zero_score_tasks[model]:
+                table = PrettyTable([f"{model} - Tasks with 0 Score"])
+                for task in zero_score_tasks[model]:
+                    table.add_row([task])
+                print(f"\n{model} - Tasks with 0 Success Score")
+                print(table)
+
+            if null_score_tasks[model]:
+                table = PrettyTable([f"{model} - Tasks with Null Score"])
+                for task in null_score_tasks[model]:
+                    table.add_row([task])
+                print(f"\n{model} - Tasks with Null Success Score")
+                print(table)
+
+    def display_overall_averages():
+        table = PrettyTable(["Metric"] + model_names)
+
+        # Overall average score (including zeros, excluding nulls)
+        row_with_zeros = ["Average Score (All Tasks)"]
+        for model in model_names:
+            valid_scores = [s for s in overall_scores[model] if s is not None]
+            avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0
+            row_with_zeros.append(round(avg, 2))
+        table.add_row(row_with_zeros)
+
+        # Overall average score (excluding zeros and nulls)
+        row_without_zeros = ["Average Score (Completed Tasks)"]
+        for model in model_names:
+            completed_scores = [s for s in overall_scores[model] if s is not None and s > 0]
+            avg = sum(completed_scores) / len(completed_scores) if completed_scores else 0
+            row_without_zeros.append(round(avg, 2))
+        table.add_row(row_without_zeros)
+
+        # Task completion rate
+        completion_row = ["Task Completion Rate (%)"]
+        for model in model_names:
+            completion_row.append(round(model_completion_rates[model] * 100, 2))
+        table.add_row(completion_row)
+
+        # Total number of tasks (excluding nulls)
+        task_count_row = ["Total Tasks"]
+        for model in model_names:
+            valid_tasks = [task for task in all_task_scores.keys() if model in all_task_scores[task] and all_task_scores[task][model] is not None]
+            task_count_row.append(len(valid_tasks))
+        table.add_row(task_count_row)
+
+        print("\nOverall Performance Metrics")
+        print(table)
+
+    display_overall_averages()  # Display overall averages first
     display_task_scores()
-    display_table("Average Success Score by Material (Grouped by Number)", avg_material_scores)
-    display_table("Average Success Score by Room (Grouped by Number)", avg_room_scores)
+    display_zero_and_null_score_tasks()
+    display_table("Average Success Score by Material", avg_material_scores)
+    display_table("Average Success Score by Room", avg_room_scores)
+    display_table("Average Success Score by (Material, Room) Tuples", avg_material_room_scores, tuple_keys=True)

-# Example usage (replace 'root_directory' with actual path)
-root_directory = "experiments/exp_03-22_19-29"
-extract_success_scores(root_directory)
+# Example usage
+folders = ["experiments/gpt-4o_construction_tasks", "experiments/exp_03-23_12-31"]
+model_names = ["GPT-4o", "Claude 3.5 sonnet"]
+extract_success_scores(folders, model_names)
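The densest change in the construction-task script above is the nested comprehension in the new calculate_average, which now averages per model within each material/room group. The following standalone sketch is not part of the commit; the scores are made up, and the model names are borrowed from the script's example usage. It only illustrates the input and output shapes the function works with:

    from collections import defaultdict

    # Shape produced by the new grouping code: {material: {model: [scores > 0]}}
    # (the script only appends non-null, non-zero scores upstream)
    material_groups = defaultdict(lambda: defaultdict(list))
    material_groups[1]["GPT-4o"].extend([0.5, 1.0])
    material_groups[1]["Claude 3.5 sonnet"].append(1.0)
    material_groups[2]["GPT-4o"].append(0.25)

    def calculate_average(group):
        # Average each model's scores within each key, skipping empty lists
        return {key: {model: sum(scores) / len(scores) for model, scores in models.items() if scores}
                for key, models in group.items() if models}

    print(calculate_average(material_groups))
    # {1: {'GPT-4o': 0.75, 'Claude 3.5 sonnet': 1.0}, 2: {'GPT-4o': 0.25}}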
@@ -20,15 +20,11 @@ def extract_cooking_items(exp_dir):

     return items

-def analyze_experiments(root_dir):
+def analyze_experiments(root_dir, model_name):
     # Store results by number of blocked agents
     blocked_access_results = defaultdict(lambda: {
         "success": 0,
-        "total": 0,
-        "cake_success": 0,
-        "cake_total": 0,
-        "non_cake_success": 0,
-        "non_cake_total": 0
+        "total": 0
     })

     # Store results by cooking item
@@ -51,9 +47,6 @@ def analyze_experiments(root_dir):
         # Add to unique items set
         all_cooking_items.update(cooking_items)

-        # Check if experiment involves cake
-        has_cake = any(item == "cake" for item in cooking_items)
-
         # Extract blocked access information from directory name
         blocked_access_match = re.search(r'blocked_access_([0-9_]+)$', exp_dir)

@@ -104,119 +97,284 @@ def analyze_experiments(root_dir):
         if is_successful:
             cooking_item_results[item]["success"] += 1

-        # Update the appropriate blocked access counters
-        # First update the category-specific counters
-        if has_cake:
-            blocked_access_results[blocked_key]["cake_total"] += 1
-            if is_successful:
-                blocked_access_results[blocked_key]["cake_success"] += 1
-        else:
-            blocked_access_results[blocked_key]["non_cake_total"] += 1
-            if is_successful:
-                blocked_access_results[blocked_key]["non_cake_success"] += 1
-
-            # Only count non-cake experiments in the main totals
-            blocked_access_results[blocked_key]["total"] += 1
-            if is_successful:
-                blocked_access_results[blocked_key]["success"] += 1
+        # Update the blocked access counters
+        blocked_access_results[blocked_key]["total"] += 1
+        if is_successful:
+            blocked_access_results[blocked_key]["success"] += 1

     return blocked_access_results, cooking_item_results, all_cooking_items

-def print_blocked_results(results):
-    print("\nExperiment Results by Number of Agents with Blocked Access (Excluding Cake Experiments):")
-    print("=" * 80)
-    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15} | {'Cake Tasks':<15} | {'Non-Cake Tasks':<15}")
-    print("-" * 80)
-
-    # Calculate totals
-    total_success = 0
-    total_experiments = 0
-    total_cake = 0
-    total_non_cake = 0
-
-    # Sort by number of blocked agents
-    for key in sorted(results.keys(), key=lambda x: int(x.split()[0])):
-        success = results[key]["success"]
-        total = results[key]["total"]
-        cake_total = results[key]["cake_total"]
-        non_cake_total = results[key]["non_cake_total"]
-
-        # Verify that non_cake_total matches total
-        if non_cake_total != total:
-            print(f"Warning: Non-cake total ({non_cake_total}) doesn't match the total ({total}) for {key}")
-
-        total_success += success
-        total_experiments += total
-        total_cake += cake_total
-        total_non_cake += non_cake_total
-
-        success_rate = (success / total * 100) if total > 0 else 0
-        print(f"{key:<15} | {success_rate:>6.2f}% | {success}/{total:<13} | {cake_total:<15} | {non_cake_total:<15}")
-
-    # Calculate overall success rate (excluding cake experiments)
-    overall_success_rate = (total_success / total_experiments * 100) if total_experiments > 0 else 0
-    print("-" * 80)
-    print(f"{'Overall':<15} | {overall_success_rate:>6.2f}% | {total_success}/{total_experiments:<13} | {total_cake:<15} | {total_non_cake:<15}")
-
-    # Print cake experiment details
-    print("\nCake Experiment Details:")
-    print("=" * 60)
-    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15}")
-    print("-" * 60)
-
-    cake_total_success = 0
-    cake_total_experiments = 0
-
-    for key in sorted(results.keys(), key=lambda x: int(x.split()[0])):
-        cake_success = results[key]["cake_success"]
-        cake_total = results[key]["cake_total"]
-
-        cake_total_success += cake_success
-        cake_total_experiments += cake_total
-
-        cake_success_rate = (cake_success / cake_total * 100) if cake_total > 0 else 0
-
-        print(f"{key:<15} | {cake_success_rate:>6.2f}% | {cake_success}/{cake_total}")
-
-    cake_overall_success_rate = (cake_total_success / cake_total_experiments * 100) if cake_total_experiments > 0 else 0
-
-    print("-" * 60)
-    print(f"{'Overall':<15} | {cake_overall_success_rate:>6.2f}% | {cake_total_success}/{cake_total_experiments}")
-
-def print_cooking_items(cooking_items):
-    print("\nUnique Cooking Items Found:")
-    print("=" * 60)
-    print(", ".join(sorted(cooking_items)))
-    print(f"Total unique items: {len(cooking_items)}")
-
-def print_item_results(item_results):
-    print("\nExperiment Results by Cooking Item:")
-    print("=" * 60)
-    print(f"{'Cooking Item':<20} | {'Success Rate':<15} | {'Success/Total':<15}")
-    print("-" * 60)
-
-    # Sort by item name
-    for item in sorted(item_results.keys()):
-        success = item_results[item]["success"]
-        total = item_results[item]["total"]
-        success_rate = (success / total * 100) if total > 0 else 0
-        print(f"{item:<20} | {success_rate:>6.2f}% | {success}/{total}")
-
-    print("-" * 60)
+def print_model_comparison_blocked(models_results):
+    print("\nModel Comparison by Number of Agents with Blocked Access:")
+    print("=" * 100)
+
+    # Get all possible blocked access keys
+    all_blocked_keys = set()
+    for model_results in models_results.values():
+        all_blocked_keys.update(model_results.keys())
+
+    # Sort the keys
+    sorted_keys = sorted(all_blocked_keys, key=lambda x: int(x.split()[0]))
+
+    # Create the header
+    header = f"{'Blocked Agents':<15} | "
+    for model_name in models_results.keys():
+        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
+    print(header)
+    print("-" * 100)
+
+    # Calculate and print the results for each blocked key
+    model_totals = {model: {"success": 0, "total": 0} for model in models_results.keys()}
+
+    for key in sorted_keys:
+        row = f"{key:<15} | "
+
+        for model_name, model_results in models_results.items():
+            if key in model_results:
+                success = model_results[key]["success"]
+                total = model_results[key]["total"]
+
+                model_totals[model_name]["success"] += success
+                model_totals[model_name]["total"] += total
+
+                success_rate = (success / total * 100) if total > 0 else 0
+                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+            else:
+                row += f"{'N/A':<19} | {'N/A':<19} | "
+
+        print(row)
+
+    # Print the overall results
+    print("-" * 100)
+    row = f"{'Overall':<15} | "
+
+    for model_name, totals in model_totals.items():
+        success = totals["success"]
+        total = totals["total"]
+        success_rate = (success / total * 100) if total > 0 else 0
+        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+
+    print(row)
+
+def print_model_comparison_items(models_item_results, all_cooking_items):
+    print("\nModel Comparison by Cooking Item:")
+    print("=" * 100)
+
+    # Create the header
+    header = f"{'Cooking Item':<20} | "
+    for model_name in models_item_results.keys():
+        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
+    print(header)
+    print("-" * 100)
+
+    # Calculate and print the results for each cooking item
+    model_totals = {model: {"success": 0, "total": 0} for model in models_item_results.keys()}
+
+    for item in sorted(all_cooking_items):
+        row = f"{item:<20} | "
+
+        for model_name, model_results in models_item_results.items():
+            if item in model_results:
+                success = model_results[item]["success"]
+                total = model_results[item]["total"]
+
+                model_totals[model_name]["success"] += success
+                model_totals[model_name]["total"] += total
+
+                success_rate = (success / total * 100) if total > 0 else 0
+                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+            else:
+                row += f"{'N/A':<19} | {'N/A':<19} | "
+
+        print(row)
+
+    # Print the overall results
+    print("-" * 100)
+    row = f"{'Overall':<20} | "
+
+    for model_name, totals in model_totals.items():
+        success = totals["success"]
+        total = totals["total"]
+        success_rate = (success / total * 100) if total > 0 else 0
+        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+
+    print(row)
+
+def print_model_comparison_items_by_blocked(models_data, all_cooking_items):
+    print("\nDetailed Model Comparison by Cooking Item and Blocked Agent Count:")
+    print("=" * 120)
+
+    # For each cooking item, create a comparison table by blocked agent count
+    for item in sorted(all_cooking_items):
+        print(f"\nResults for cooking item: {item}")
+        print("-" * 100)
+
+        # Create the header
+        header = f"{'Blocked Agents':<15} | "
+        for model_name in models_data.keys():
+            header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
+        print(header)
+        print("-" * 100)
+
+        # Get all possible blocked agent counts
+        all_blocked_keys = set()
+        for model_name, model_data in models_data.items():
+            _, _, item_blocked_data = model_data
+            for blocked_key in item_blocked_data.get(item, {}).keys():
+                all_blocked_keys.add(blocked_key)
+
+        # Sort the keys
+        sorted_keys = sorted(all_blocked_keys, key=lambda x: int(x.split()[0]))
+
+        # Print each row
+        for blocked_key in sorted_keys:
+            row = f"{blocked_key:<15} | "
+
+            for model_name, model_data in models_data.items():
+                _, _, item_blocked_data = model_data
+
+                if item in item_blocked_data and blocked_key in item_blocked_data[item]:
+                    success = item_blocked_data[item][blocked_key]["success"]
+                    total = item_blocked_data[item][blocked_key]["total"]
+
+                    if total > 0:
+                        success_rate = (success / total * 100)
+                        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                    else:
+                        row += f"{'N/A':<19} | {'0/0':<19} | "
+                else:
+                    row += f"{'N/A':<19} | {'N/A':<19} | "
+
+            print(row)
+
+        # Print item summary for each model
+        print("-" * 100)
+        row = f"{'Overall':<15} | "
+
+        for model_name, model_data in models_data.items():
+            _, item_results, _ = model_data
+
+            if item in item_results:
+                success = item_results[item]["success"]
+                total = item_results[item]["total"]
+
+                if total > 0:
+                    success_rate = (success / total * 100)
+                    row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                else:
+                    row += f"{'N/A':<19} | {'0/0':<19} | "
+            else:
+                row += f"{'N/A':<19} | {'N/A':<19} | "
+
+        print(row)
+
+def generate_item_blocked_data(experiments_root):
+    # Organize data by item and blocked agent count
+    item_blocked_data = defaultdict(lambda: defaultdict(lambda: {"success": 0, "total": 0}))
+
+    # Populate the data structure
+    for exp_dir in os.listdir(experiments_root):
+        if not os.path.isdir(os.path.join(experiments_root, exp_dir)) or not exp_dir.startswith("multiagent_cooking_"):
+            continue
+
+        # Extract cooking items
+        cooking_items = extract_cooking_items(exp_dir)
+
+        # Extract blocked access information
+        blocked_access_match = re.search(r'blocked_access_([0-9_]+)$', exp_dir)
+        if blocked_access_match:
+            blocked_access_str = blocked_access_match.group(1)
+            num_blocked_agents = len(blocked_access_str.split('_'))
+            blocked_key = f"{num_blocked_agents} agent(s)"
+        else:
+            blocked_key = "0 agent(s)"
+
+        # Check if the task was successful
+        is_successful = False
+        full_exp_path = os.path.join(experiments_root, exp_dir)
+        agent_files = [f for f in os.listdir(full_exp_path) if f.endswith(".json")]
+
+        for agent_file in agent_files:
+            try:
+                with open(os.path.join(full_exp_path, agent_file), 'r') as f:
+                    agent_data = json.load(f)
+
+                if "turns" in agent_data:
+                    for turn in agent_data["turns"]:
+                        if turn.get("role") == "system" and "content" in turn:
+                            if isinstance(turn["content"], str) and "Task ended with score : 1" in turn["content"]:
+                                is_successful = True
+                                break
+
+                if is_successful:
+                    break
+            except:
+                continue
+
+        # Update the item-blocked data
+        for item in cooking_items:
+            item_blocked_data[item][blocked_key]["total"] += 1
+            if is_successful:
+                item_blocked_data[item][blocked_key]["success"] += 1
+
+    return item_blocked_data

 def main():
-    # Update this path to your experiments directory
-    experiments_root = "../results/llama_70b_hells_kitchen_cooking_tasks"
-
-    print(f"Analyzing experiments in: {os.path.abspath(experiments_root)}")
-    blocked_results, item_results, unique_items = analyze_experiments(experiments_root)
-
-    print_blocked_results(blocked_results)
-    print_cooking_items(unique_items)
-    print_item_results(item_results)
+    base_dir = "experiments"
+
+    # Get the model directories
+    all_model_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
+    gpt_dirs = [d for d in all_model_dirs if d.startswith("gpt-4o_30_cooking_tasks")]
+    claude_dirs = [d for d in all_model_dirs if d.startswith("llama_70b_30_cooking_tasks")]
+
+    if not gpt_dirs or not claude_dirs:
+        print("Error: Could not find both model directories. Please check your paths.")
+        return
+
+    # Use the first directory found for each model
+    gpt_dir = os.path.join(base_dir, gpt_dirs[0])
+    claude_dir = os.path.join(base_dir, claude_dirs[0])
+
+    print(f"Analyzing GPT-4o experiments in: {gpt_dir}")
+    print(f"Analyzing Claude-3.5-Sonnet experiments in: {claude_dir}")
+
+    # Analyze each model directory
+    gpt_blocked_results, gpt_item_results, gpt_unique_items = analyze_experiments(gpt_dir, "GPT-4o")
+    claude_blocked_results, claude_item_results, claude_unique_items = analyze_experiments(claude_dir, "Claude-3.5")
+
+    # Combine unique cooking items
+    all_cooking_items = gpt_unique_items.union(claude_unique_items)
+
+    # Generate item-blocked data for each model
+    gpt_item_blocked_data = generate_item_blocked_data(gpt_dir)
+    claude_item_blocked_data = generate_item_blocked_data(claude_dir)
+
+    # Create model comparison data structures
+    models_blocked_results = {
+        "GPT-4o": gpt_blocked_results,
+        "Claude-3.5": claude_blocked_results
+    }
+
+    models_item_results = {
+        "GPT-4o": gpt_item_results,
+        "Claude-3.5": claude_item_results
+    }
+
+    models_data = {
+        "GPT-4o": (gpt_blocked_results, gpt_item_results, gpt_item_blocked_data),
+        "Claude-3.5": (claude_blocked_results, claude_item_results, claude_item_blocked_data)
+    }
+
+    # Print the comparison tables
+    print_model_comparison_blocked(models_blocked_results)
+    print_model_comparison_items(models_item_results, all_cooking_items)
+    print_model_comparison_items_by_blocked(models_data, all_cooking_items)
+
+    # Print overall statistics
+    print("\nUnique Cooking Items Found:")
+    print("=" * 60)
+    print(", ".join(sorted(all_cooking_items)))
+    print(f"Total unique items: {len(all_cooking_items)}")

 if __name__ == "__main__":
     main()
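The cooking-task analysis decides success by scanning each agent's JSON log for a system turn containing the exact string "Task ended with score : 1". A minimal sketch of that check, assuming an invented log in the shape the analyzer expects (the turn contents here are made up for illustration):

    import json

    # A made-up agent log; real logs are the per-agent .json files in each experiment folder
    sample_log = json.loads('''
    {
      "turns": [
        {"role": "user", "content": "..."},
        {"role": "system", "content": "Task ended with score : 1"}
      ]
    }
    ''')

    def run_was_successful(agent_data):
        # Mirrors the success check used in generate_item_blocked_data
        for turn in agent_data.get("turns", []):
            content = turn.get("content")
            if turn.get("role") == "system" and isinstance(content, str) \
                    and "Task ended with score : 1" in content:
                return True
        return False

    print(run_was_successful(sample_log))  # True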