diff --git a/analyze_construction_tasks.py b/analyze_construction_tasks.py
index ee1a11d..7593d1a 100644
--- a/analyze_construction_tasks.py
+++ b/analyze_construction_tasks.py
@@ -4,86 +4,170 @@ from collections import defaultdict
 from prettytable import PrettyTable
 import re
 
-def extract_success_scores(root_dir):
-    task_scores = {} # Stores task-wise scores
-    material_groups = defaultdict(list)
-    room_groups = defaultdict(list)
+def extract_success_scores(folders, model_names):
+    assert len(folders) == len(model_names), "Folders and model names lists must have the same length."
+
+    all_task_scores = defaultdict(dict) # Stores task-wise scores per model
+    zero_score_tasks = defaultdict(list) # Stores tasks with 0 score per model
+    null_score_tasks = defaultdict(list) # Stores tasks with null score per model
+    material_groups = defaultdict(lambda: defaultdict(list))
+    room_groups = defaultdict(lambda: defaultdict(list))
+    material_room_groups = defaultdict(lambda: defaultdict(list))
+    overall_scores = defaultdict(list) # New dict to store all scores for each model
 
-    # Regex pattern to extract material and room numbers
     pattern = re.compile(r"materials_(\d+)_rooms_(\d+)")
-
-    # Iterate through each task folder
-    for task_folder in os.listdir(root_dir):
-        task_path = os.path.join(root_dir, task_folder)
-        if os.path.isdir(task_path):
-            logs_found = False # Flag to track if logs exist
-
-            # Check for JSON files
-            for file_name in os.listdir(task_path):
-                if file_name.endswith(".json"):
-                    logs_found = True # JSON file exists
-                    file_path = os.path.join(task_path, file_name)
-
-                    # Read JSON file
-                    try:
-                        with open(file_path, 'r') as file:
-                            data = json.load(file)
-
-                            # Extract success score from the last system message
-                            for turn in reversed(data.get("turns", [])):
-                                if turn["role"] == "system" and "Task ended with score" in turn["content"]:
-                                    score = float(turn["content"].split(":")[-1].strip())
-                                    task_scores[task_folder] = score # Store per-task score
-                                    break # Stop searching if found
-
-                        # Stop checking other files in the folder if score is found
-                        if task_folder in task_scores:
+
+    for root_dir, model_name in zip(folders, model_names):
+        for task_folder in os.listdir(root_dir):
+            task_path = os.path.join(root_dir, task_folder)
+            if os.path.isdir(task_path):
+                logs_found = False
+                score_found = False
+
+                for file_name in os.listdir(task_path):
+                    if file_name.endswith(".json"):
+                        logs_found = True
+                        file_path = os.path.join(task_path, file_name)
+
+                        try:
+                            with open(file_path, 'r') as file:
+                                data = json.load(file)
+
+                                for turn in reversed(data.get("turns", [])):
+                                    if turn["role"] == "system" and "Task ended with score" in turn["content"]:
+                                        score = float(turn["content"].split(":")[-1].strip())
+                                        all_task_scores[task_folder][model_name] = score
+                                        overall_scores[model_name].append(score) # Add to overall scores
+                                        score_found = True
+
+                                        if score == 0:
+                                            zero_score_tasks[model_name].append(task_folder)
+                                        break
+
+                            if score_found:
                                 break
-                    except Exception as e:
-                        print(f"Error reading {file_path}: {e}")
-
-            # If no logs were found, print a message
-            if not logs_found:
-                print(f"No log files found in {task_folder}")
-
-    # Group scores by material and room
-    for task, score in task_scores.items():
+                        except Exception as e:
+                            print(f"Error reading {file_path}: {e}")
+
+                if logs_found and not score_found:
+                    # Score not found but logs exist - mark as null
+                    all_task_scores[task_folder][model_name] = None
+                    null_score_tasks[model_name].append(task_folder)
+
+                if not logs_found:
+                    print(f"No log files found in {task_folder}")
+
+    # Calculate model completion rates (ignore null scores)
+    model_completion_rates = {}
+    for model_name in model_names:
+        valid_tasks = [task for task in all_task_scores.keys() if model_name in all_task_scores[task] and all_task_scores[task][model_name] is not None]
+        total_tasks = len(valid_tasks)
+        completed_tasks = len([task for task in valid_tasks if all_task_scores[task][model_name] > 0])
+        model_completion_rates[model_name] = (completed_tasks / total_tasks) if total_tasks > 0 else 0
+
+    # Process task scores into groups (ignore null and 0 scores)
+    for task, model_scores in all_task_scores.items():
         match = pattern.search(task)
         if match:
-            material = int(match.group(1)) # Extract material number
-            room = int(match.group(2)) # Extract room number
-            material_groups[material].append(score)
-            room_groups[room].append(score)
-        else:
-            print(f"Warning: Task folder '{task}' does not match expected format.")
-
-    # Calculate average scores
+            material = int(match.group(1))
+            room = int(match.group(2))
+
+            for model, score in model_scores.items():
+                if score is not None and score > 0: # Ignore null and 0 scores
+                    material_groups[material][model].append(score)
+                    room_groups[room][model].append(score)
+                    material_room_groups[(material, room)][model].append(score)
+
     def calculate_average(group):
-        return {key: sum(values) / len(values) for key, values in group.items()}
-
+        return {key: {model: sum(scores) / len(scores) for model, scores in models.items() if scores}
+                for key, models in group.items() if models}
+
     avg_material_scores = calculate_average(material_groups)
     avg_room_scores = calculate_average(room_groups)
-
-    # Display results using PrettyTable
-    def display_table(title, data):
-        table = PrettyTable(["Category", "Average Score"])
-        for key, value in sorted(data.items()):
-            table.add_row([key, round(value, 2)])
+    avg_material_room_scores = calculate_average(material_room_groups)
+
+    def display_table(title, data, tuple_keys=False):
+        table = PrettyTable(["Category"] + model_names)
+        for key, model_scores in sorted(data.items()):
+            key_display = key if not tuple_keys else f"({key[0]}, {key[1]})"
+            row = [key_display] + [round(model_scores.get(model, 0), 2) for model in model_names]
+            table.add_row(row)
         print(f"\n{title}")
         print(table)
-
+
     def display_task_scores():
-        table = PrettyTable(["Task", "Success Score"])
-        for task, score in sorted(task_scores.items()):
-            table.add_row([task, round(score, 2)])
+        table = PrettyTable(["Task"] + model_names)
+        for task in sorted(all_task_scores.keys()):
+            row = [task]
+            for model in model_names:
+                score = all_task_scores[task].get(model)
+                if score is None:
+                    row.append("null")
+                else:
+                    row.append(round(score, 2))
+            table.add_row(row)
         print("\nTask-wise Success Scores")
         print(table)
-
-    # Print all tables
+
+    def display_zero_and_null_score_tasks():
+        for model in model_names:
+            if zero_score_tasks[model]:
+                table = PrettyTable([f"{model} - Tasks with 0 Score"])
+                for task in zero_score_tasks[model]:
+                    table.add_row([task])
+                print(f"\n{model} - Tasks with 0 Success Score")
+                print(table)
+
+            if null_score_tasks[model]:
+                table = PrettyTable([f"{model} - Tasks with Null Score"])
+                for task in null_score_tasks[model]:
+                    table.add_row([task])
+                print(f"\n{model} - Tasks with Null Success Score")
+                print(table)
+
+    def display_overall_averages():
+        table = PrettyTable(["Metric"] + model_names)
+
+        # Overall average score (including zeros, excluding nulls)
+        row_with_zeros = ["Average Score (All Tasks)"]
+        for model in model_names:
+            valid_scores = [s for s in overall_scores[model] if s is not None]
+            avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0
+            row_with_zeros.append(round(avg, 2))
+        table.add_row(row_with_zeros)
+
+        # Overall average score (excluding zeros and nulls)
+        row_without_zeros = ["Average Score (Completed Tasks)"]
+        for model in model_names:
+            completed_scores = [s for s in overall_scores[model] if s is not None and s > 0]
+            avg = sum(completed_scores) / len(completed_scores) if completed_scores else 0
+            row_without_zeros.append(round(avg, 2))
+        table.add_row(row_without_zeros)
+
+        # Task completion rate
+        completion_row = ["Task Completion Rate (%)"]
+        for model in model_names:
+            completion_row.append(round(model_completion_rates[model] * 100, 2))
+        table.add_row(completion_row)
+
+        # Total number of tasks (excluding nulls)
+        task_count_row = ["Total Tasks"]
+        for model in model_names:
+            valid_tasks = [task for task in all_task_scores.keys() if model in all_task_scores[task] and all_task_scores[task][model] is not None]
+            task_count_row.append(len(valid_tasks))
+        table.add_row(task_count_row)
+
+        print("\nOverall Performance Metrics")
+        print(table)
+
+    display_overall_averages() # Display overall averages first
     display_task_scores()
-    display_table("Average Success Score by Material (Grouped by Number)", avg_material_scores)
-    display_table("Average Success Score by Room (Grouped by Number)", avg_room_scores)
+    display_zero_and_null_score_tasks()
+    display_table("Average Success Score by Material", avg_material_scores)
+    display_table("Average Success Score by Room", avg_room_scores)
+    display_table("Average Success Score by (Material, Room) Tuples", avg_material_room_scores, tuple_keys=True)
 
-# Example usage (replace 'root_directory' with actual path)
-root_directory = "experiments/exp_03-22_19-29"
-extract_success_scores(root_directory)
\ No newline at end of file
+# Example usage
+folders = ["experiments/gpt-4o_construction_tasks", "experiments/exp_03-23_12-31"]
+model_names = ["GPT-4o","Claude 3.5 sonnet"]
+extract_success_scores(folders, model_names)
\ No newline at end of file
diff --git a/analyze_cooking_tasks.py b/analyze_cooking_tasks.py
index d727d5b..94eef6d 100644
--- a/analyze_cooking_tasks.py
+++ b/analyze_cooking_tasks.py
@@ -20,15 +20,11 @@ def extract_cooking_items(exp_dir):
 
     return items
 
-def analyze_experiments(root_dir):
+def analyze_experiments(root_dir, model_name):
     # Store results by number of blocked agents
     blocked_access_results = defaultdict(lambda: {
         "success": 0,
-        "total": 0,
-        "cake_success": 0,
-        "cake_total": 0,
-        "non_cake_success": 0,
-        "non_cake_total": 0
+        "total": 0
     })
 
     # Store results by cooking item
@@ -51,9 +47,6 @@ def analyze_experiments(root_dir):
         # Add to unique items set
         all_cooking_items.update(cooking_items)
 
-        # Check if experiment involves cake
-        has_cake = any(item == "cake" for item in cooking_items)
-
         # Extract blocked access information from directory name
         blocked_access_match = re.search(r'blocked_access_([0-9_]+)$', exp_dir)
 
@@ -104,119 +97,284 @@ def analyze_experiments(root_dir):
             if is_successful:
                 cooking_item_results[item]["success"] += 1
 
-        # Update the appropriate blocked access counters
-        # First update the category-specific counters
-        if has_cake:
-            blocked_access_results[blocked_key]["cake_total"] += 1
-            if is_successful:
-                blocked_access_results[blocked_key]["cake_success"] += 1
-        else:
-            blocked_access_results[blocked_key]["non_cake_total"] += 1
-            if is_successful:
-                blocked_access_results[blocked_key]["non_cake_success"] += 1
-
-            # Only count non-cake experiments in the main totals
-            blocked_access_results[blocked_key]["total"] += 1
-            if is_successful:
-                blocked_access_results[blocked_key]["success"] += 1
+        # Update the blocked access counters
+        blocked_access_results[blocked_key]["total"] += 1
+        if is_successful:
+            blocked_access_results[blocked_key]["success"] += 1
 
     return blocked_access_results, cooking_item_results, all_cooking_items
 
-def print_blocked_results(results):
-    print("\nExperiment Results by Number of Agents with Blocked Access (Excluding Cake Experiments):")
-    print("=" * 80)
-    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15} | {'Cake Tasks':<15} | {'Non-Cake Tasks':<15}")
-    print("-" * 80)
+def print_model_comparison_blocked(models_results):
+    print("\nModel Comparison by Number of Agents with Blocked Access:")
+    print("=" * 100)
 
-    # Calculate totals
-    total_success = 0
-    total_experiments = 0
-    total_cake = 0
-    total_non_cake = 0
+    # Get all possible blocked access keys
+    all_blocked_keys = set()
+    for model_results in models_results.values():
+        all_blocked_keys.update(model_results.keys())
 
-    # Sort by number of blocked agents
-    for key in sorted(results.keys(), key=lambda x: int(x.split()[0])):
-        success = results[key]["success"]
-        total = results[key]["total"]
-        cake_total = results[key]["cake_total"]
-        non_cake_total = results[key]["non_cake_total"]
+    # Sort the keys
+    sorted_keys = sorted(all_blocked_keys, key=lambda x: int(x.split()[0]))
+
+    # Create the header
+    header = f"{'Blocked Agents':<15} | "
+    for model_name in models_results.keys():
+        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
+    print(header)
+    print("-" * 100)
+
+    # Calculate and print the results for each blocked key
+    model_totals = {model: {"success": 0, "total": 0} for model in models_results.keys()}
+
+    for key in sorted_keys:
+        row = f"{key:<15} | "
 
-        # Verify that non_cake_total matches total
-        if non_cake_total != total:
-            print(f"Warning: Non-cake total ({non_cake_total}) doesn't match the total ({total}) for {key}")
-
-        total_success += success
-        total_experiments += total
-        total_cake += cake_total
-        total_non_cake += non_cake_total
+        for model_name, model_results in models_results.items():
+            if key in model_results:
+                success = model_results[key]["success"]
+                total = model_results[key]["total"]
+
+                model_totals[model_name]["success"] += success
+                model_totals[model_name]["total"] += total
+
+                success_rate = (success / total * 100) if total > 0 else 0
+                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+            else:
+                row += f"{'N/A':<19} | {'N/A':<19} | "
+        print(row)
+
+    # Print the overall results
+    print("-" * 100)
+    row = f"{'Overall':<15} | "
+
+    for model_name, totals in model_totals.items():
+        success = totals["success"]
+        total = totals["total"]
         success_rate = (success / total * 100) if total > 0 else 0
-
-        print(f"{key:<15} | {success_rate:>6.2f}% | {success}/{total:<13} | {cake_total:<15} | {non_cake_total:<15}")
+        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
 
-    # Calculate overall success rate (excluding cake experiments)
-    overall_success_rate = (total_success / total_experiments * 100) if total_experiments > 0 else 0
-
-    print("-" * 80)
-    print(f"{'Overall':<15} | {overall_success_rate:>6.2f}% | {total_success}/{total_experiments:<13} | {total_cake:<15} | {total_non_cake:<15}")
-
-    # Print cake experiment details
-    print("\nCake Experiment Details:")
-    print("=" * 60)
-    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15}")
-    print("-" * 60)
-
-    cake_total_success = 0
-    cake_total_experiments = 0
-
-    for key in sorted(results.keys(), key=lambda x: int(x.split()[0])):
-        cake_success = results[key]["cake_success"]
-        cake_total = results[key]["cake_total"]
-
-        cake_total_success += cake_success
-        cake_total_experiments += cake_total
-
-        cake_success_rate = (cake_success / cake_total * 100) if cake_total > 0 else 0
-
-        print(f"{key:<15} | {cake_success_rate:>6.2f}% | {cake_success}/{cake_total}")
-
-    cake_overall_success_rate = (cake_total_success / cake_total_experiments * 100) if cake_total_experiments > 0 else 0
-
-    print("-" * 60)
-    print(f"{'Overall':<15} | {cake_overall_success_rate:>6.2f}% | {cake_total_success}/{cake_total_experiments}")
+    print(row)
 
-def print_cooking_items(cooking_items):
-    print("\nUnique Cooking Items Found:")
-    print("=" * 60)
-    print(", ".join(sorted(cooking_items)))
-    print(f"Total unique items: {len(cooking_items)}")
-
-def print_item_results(item_results):
-    print("\nExperiment Results by Cooking Item:")
-    print("=" * 60)
-    print(f"{'Cooking Item':<20} | {'Success Rate':<15} | {'Success/Total':<15}")
-    print("-" * 60)
+def print_model_comparison_items(models_item_results, all_cooking_items):
+    print("\nModel Comparison by Cooking Item:")
+    print("=" * 100)
 
-    # Sort by item name
-    for item in sorted(item_results.keys()):
-        success = item_results[item]["success"]
-        total = item_results[item]["total"]
+    # Create the header
+    header = f"{'Cooking Item':<20} | "
+    for model_name in models_item_results.keys():
+        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
+    print(header)
+    print("-" * 100)
+
+    # Calculate and print the results for each cooking item
+    model_totals = {model: {"success": 0, "total": 0} for model in models_item_results.keys()}
+
+    for item in sorted(all_cooking_items):
+        row = f"{item:<20} | "
+
+        for model_name, model_results in models_item_results.items():
+            if item in model_results:
+                success = model_results[item]["success"]
+                total = model_results[item]["total"]
+
+                model_totals[model_name]["success"] += success
+                model_totals[model_name]["total"] += total
+
+                success_rate = (success / total * 100) if total > 0 else 0
+                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+            else:
+                row += f"{'N/A':<19} | {'N/A':<19} | "
+
+        print(row)
+
+    # Print the overall results
+    print("-" * 100)
+    row = f"{'Overall':<20} | "
+
+    for model_name, totals in model_totals.items():
+        success = totals["success"]
+        total = totals["total"]
         success_rate = (success / total * 100) if total > 0 else 0
-
-        print(f"{item:<20} | {success_rate:>6.2f}% | {success}/{total}")
+        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
 
-    print("-" * 60)
+    print(row)
+
+def print_model_comparison_items_by_blocked(models_data, all_cooking_items):
+    print("\nDetailed Model Comparison by Cooking Item and Blocked Agent Count:")
+    print("=" * 120)
+
+    # For each cooking item, create a comparison table by blocked agent count
+    for item in sorted(all_cooking_items):
+        print(f"\nResults for cooking item: {item}")
+        print("-" * 100)
+
+        # Create the header
+        header = f"{'Blocked Agents':<15} | "
+        for model_name in models_data.keys():
+            header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
+        print(header)
+        print("-" * 100)
+
+        # Get all possible blocked agent counts
+        all_blocked_keys = set()
+        for model_name, model_data in models_data.items():
+            _, _, item_blocked_data = model_data
+            for blocked_key in item_blocked_data.get(item, {}).keys():
+                all_blocked_keys.add(blocked_key)
+
+        # Sort the keys
+        sorted_keys = sorted(all_blocked_keys, key=lambda x: int(x.split()[0]))
+
+        # Print each row
+        for blocked_key in sorted_keys:
+            row = f"{blocked_key:<15} | "
+
+            for model_name, model_data in models_data.items():
+                _, _, item_blocked_data = model_data
+
+                if item in item_blocked_data and blocked_key in item_blocked_data[item]:
+                    success = item_blocked_data[item][blocked_key]["success"]
+                    total = item_blocked_data[item][blocked_key]["total"]
+
+                    if total > 0:
+                        success_rate = (success / total * 100)
+                        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                    else:
+                        row += f"{'N/A':<19} | {'0/0':<19} | "
+                else:
+                    row += f"{'N/A':<19} | {'N/A':<19} | "
+
+            print(row)
+
+        # Print item summary for each model
+        print("-" * 100)
+        row = f"{'Overall':<15} | "
+
+        for model_name, model_data in models_data.items():
+            _, item_results, _ = model_data
+
+            if item in item_results:
+                success = item_results[item]["success"]
+                total = item_results[item]["total"]
+
+                if total > 0:
+                    success_rate = (success / total * 100)
+                    row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                else:
+                    row += f"{'N/A':<19} | {'0/0':<19} | "
+            else:
+                row += f"{'N/A':<19} | {'N/A':<19} | "
+
+        print(row)
+
+def generate_item_blocked_data(experiments_root):
+    # Organize data by item and blocked agent count
+    item_blocked_data = defaultdict(lambda: defaultdict(lambda: {"success": 0, "total": 0}))
+
+    # Populate the data structure
+    for exp_dir in os.listdir(experiments_root):
+        if not os.path.isdir(os.path.join(experiments_root, exp_dir)) or not exp_dir.startswith("multiagent_cooking_"):
+            continue
+
+        # Extract cooking items
+        cooking_items = extract_cooking_items(exp_dir)
+
+        # Extract blocked access information
+        blocked_access_match = re.search(r'blocked_access_([0-9_]+)$', exp_dir)
+        if blocked_access_match:
+            blocked_access_str = blocked_access_match.group(1)
+            num_blocked_agents = len(blocked_access_str.split('_'))
+            blocked_key = f"{num_blocked_agents} agent(s)"
+        else:
+            blocked_key = "0 agent(s)"
+
+        # Check if the task was successful
+        is_successful = False
+        full_exp_path = os.path.join(experiments_root, exp_dir)
+        agent_files = [f for f in os.listdir(full_exp_path) if f.endswith(".json")]
+
+        for agent_file in agent_files:
+            try:
+                with open(os.path.join(full_exp_path, agent_file), 'r') as f:
+                    agent_data = json.load(f)
+
+                    if "turns" in agent_data:
+                        for turn in agent_data["turns"]:
+                            if turn.get("role") == "system" and "content" in turn:
+                                if isinstance(turn["content"], str) and "Task ended with score : 1" in turn["content"]:
+                                    is_successful = True
+                                    break
+
+                    if is_successful:
+                        break
+            except:
+                continue
+
+        # Update the item-blocked data
+        for item in cooking_items:
+            item_blocked_data[item][blocked_key]["total"] += 1
+            if is_successful:
+                item_blocked_data[item][blocked_key]["success"] += 1
+
+    return item_blocked_data
 
 def main():
-    # Update this path to your experiments directory
-    experiments_root = "../results/llama_70b_hells_kitchen_cooking_tasks"
+    base_dir = "experiments"
 
-    print(f"Analyzing experiments in: {os.path.abspath(experiments_root)}")
-    blocked_results, item_results, unique_items = analyze_experiments(experiments_root)
+    # Get the model directories
+    all_model_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
+    gpt_dirs = [d for d in all_model_dirs if d.startswith("gpt-4o_30_cooking_tasks")]
+    claude_dirs = [d for d in all_model_dirs if d.startswith("llama_70b_30_cooking_tasks")]
 
-    print_blocked_results(blocked_results)
-    print_cooking_items(unique_items)
-    print_item_results(item_results)
+    if not gpt_dirs or not claude_dirs:
+        print("Error: Could not find both model directories. Please check your paths.")
+        return
+
+    # Use the first directory found for each model
+    gpt_dir = os.path.join(base_dir, gpt_dirs[0])
+    claude_dir = os.path.join(base_dir, claude_dirs[0])
+
+    print(f"Analyzing GPT-4o experiments in: {gpt_dir}")
+    print(f"Analyzing Claude-3.5-Sonnet experiments in: {claude_dir}")
+
+    # Analyze each model directory
+    gpt_blocked_results, gpt_item_results, gpt_unique_items = analyze_experiments(gpt_dir, "GPT-4o")
+    claude_blocked_results, claude_item_results, claude_unique_items = analyze_experiments(claude_dir, "Claude-3.5")
+
+    # Combine unique cooking items
+    all_cooking_items = gpt_unique_items.union(claude_unique_items)
+
+    # Generate item-blocked data for each model
+    gpt_item_blocked_data = generate_item_blocked_data(gpt_dir)
+    claude_item_blocked_data = generate_item_blocked_data(claude_dir)
+
+    # Create model comparison data structures
+    models_blocked_results = {
+        "GPT-4o": gpt_blocked_results,
+        "Claude-3.5": claude_blocked_results
+    }
+
+    models_item_results = {
+        "GPT-4o": gpt_item_results,
+        "Claude-3.5": claude_item_results
+    }
+
+    models_data = {
+        "GPT-4o": (gpt_blocked_results, gpt_item_results, gpt_item_blocked_data),
+        "Claude-3.5": (claude_blocked_results, claude_item_results, claude_item_blocked_data)
+    }
+
+    # Print the comparison tables
+    print_model_comparison_blocked(models_blocked_results)
+    print_model_comparison_items(models_item_results, all_cooking_items)
+    print_model_comparison_items_by_blocked(models_data, all_cooking_items)
+
+    # Print overall statistics
+    print("\nUnique Cooking Items Found:")
+    print("=" * 60)
+    print(", ".join(sorted(all_cooking_items)))
+    print(f"Total unique items: {len(all_cooking_items)}")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file