From d39b254a06317fe6b6fac33f181302f0c68dc868 Mon Sep 17 00:00:00 2001
From: Ayush Maniar
Date: Fri, 28 Mar 2025 13:13:16 -0700
Subject: [PATCH] Updates on construction and cooking tasks, prettyTable,
 flexibility to enter multiple folders, ....

---
 analyze_construction_tasks.py |  52 +++---
 analyze_cooking_tasks.py      | 338 +++++++++++++++++-----------------
 2 files changed, 200 insertions(+), 190 deletions(-)

diff --git a/analyze_construction_tasks.py b/analyze_construction_tasks.py
index 7593d1a..4c1f94f 100644
--- a/analyze_construction_tasks.py
+++ b/analyze_construction_tasks.py
@@ -9,11 +9,11 @@ def extract_success_scores(folders, model_names):
     all_task_scores = defaultdict(dict) # Stores task-wise scores per model
     zero_score_tasks = defaultdict(list) # Stores tasks with 0 score per model
-    null_score_tasks = defaultdict(list) # Stores tasks with null score per model
     material_groups = defaultdict(lambda: defaultdict(list))
     room_groups = defaultdict(lambda: defaultdict(list))
     material_room_groups = defaultdict(lambda: defaultdict(list))
     overall_scores = defaultdict(list) # New dict to store all scores for each model
+    skipped_tasks = defaultdict(list) # Stores tasks with no score message per model
 
     pattern = re.compile(r"materials_(\d+)_rooms_(\d+)")
 
@@ -50,22 +50,22 @@ def extract_success_scores(folders, model_names):
                     print(f"Error reading {file_path}: {e}")
 
             if logs_found and not score_found:
-                # Score not found but logs exist - mark as null
-                all_task_scores[task_folder][model_name] = None
-                null_score_tasks[model_name].append(task_folder)
+                # Score not found but logs exist - skip this task
+                skipped_tasks[model_name].append(task_folder)
+                print(f"Error: No score message found for task '{task_folder}' with model '{model_name}'. Skipping this task.")
 
             if not logs_found:
                 print(f"No log files found in {task_folder}")
 
-    # Calculate model completion rates (ignore null scores)
+    # Calculate model completion rates (only consider tasks with scores)
     model_completion_rates = {}
     for model_name in model_names:
-        valid_tasks = [task for task in all_task_scores.keys() if model_name in all_task_scores[task] and all_task_scores[task][model_name] is not None]
+        valid_tasks = [task for task in all_task_scores.keys() if model_name in all_task_scores[task]]
         total_tasks = len(valid_tasks)
         completed_tasks = len([task for task in valid_tasks if all_task_scores[task][model_name] > 0])
         model_completion_rates[model_name] = (completed_tasks / total_tasks) if total_tasks > 0 else 0
 
-    # Process task scores into groups (ignore null and 0 scores)
+    # Process task scores into groups (ignore 0 scores)
     for task, model_scores in all_task_scores.items():
         match = pattern.search(task)
         if match:
@@ -73,7 +73,7 @@ def extract_success_scores(folders, model_names):
             room = int(match.group(2))
 
             for model, score in model_scores.items():
-                if score is not None and score > 0: # Ignore null and 0 scores
+                if score > 0: # Ignore 0 scores
                     material_groups[material][model].append(score)
                     room_groups[room][model].append(score)
                     material_room_groups[(material, room)][model].append(score)
@@ -102,14 +102,14 @@ def extract_success_scores(folders, model_names):
             for model in model_names:
                 score = all_task_scores[task].get(model)
                 if score is None:
-                    row.append("null")
+                    row.append("-")
                 else:
                     row.append(round(score, 2))
             table.add_row(row)
         print("\nTask-wise Success Scores")
         print(table)
 
-    def display_zero_and_null_score_tasks():
+    def display_zero_and_skipped_tasks():
         for model in model_names:
             if zero_score_tasks[model]:
                 table = PrettyTable([f"{model} - Tasks with 0 Score"])
@@ -118,28 +118,28 @@ def extract_success_scores(folders, model_names):
                 print(f"\n{model} - Tasks with 0 Success Score")
                 print(table)
 
-            if null_score_tasks[model]:
-                table = PrettyTable([f"{model} - Tasks with Null Score"])
-                for task in null_score_tasks[model]:
+            if skipped_tasks[model]:
+                table = PrettyTable([f"{model} - Skipped Tasks (No Score Message)"])
+                for task in skipped_tasks[model]:
                     table.add_row([task])
-                print(f"\n{model} - Tasks with Null Success Score")
+                print(f"\n{model} - Skipped Tasks (No Score Message)")
                 print(table)
 
     def display_overall_averages():
         table = PrettyTable(["Metric"] + model_names)
 
-        # Overall average score (including zeros, excluding nulls)
+        # Overall average score (including zeros)
         row_with_zeros = ["Average Score (All Tasks)"]
         for model in model_names:
-            valid_scores = [s for s in overall_scores[model] if s is not None]
+            valid_scores = overall_scores[model]
             avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0
             row_with_zeros.append(round(avg, 2))
         table.add_row(row_with_zeros)
 
-        # Overall average score (excluding zeros and nulls)
+        # Overall average score (excluding zeros)
        row_without_zeros = ["Average Score (Completed Tasks)"]
        for model in model_names:
-            completed_scores = [s for s in overall_scores[model] if s is not None and s > 0]
+            completed_scores = [s for s in overall_scores[model] if s > 0]
             avg = sum(completed_scores) / len(completed_scores) if completed_scores else 0
             row_without_zeros.append(round(avg, 2))
         table.add_row(row_without_zeros)
@@ -150,24 +150,30 @@ def extract_success_scores(folders, model_names):
             completion_row.append(round(model_completion_rates[model] * 100, 2))
         table.add_row(completion_row)
 
-        # Total number of tasks (excluding nulls)
+        # Total number of tasks
         task_count_row = ["Total Tasks"]
         for model in model_names:
-            valid_tasks = [task for task in all_task_scores.keys() if model in all_task_scores[task] and all_task_scores[task][model] is not None]
+            valid_tasks = [task for task in all_task_scores.keys() if model in all_task_scores[task]]
             task_count_row.append(len(valid_tasks))
         table.add_row(task_count_row)
 
+        # Number of skipped tasks
+        skipped_count_row = ["Skipped Tasks"]
+        for model in model_names:
+            skipped_count_row.append(len(skipped_tasks[model]))
+        table.add_row(skipped_count_row)
+
         print("\nOverall Performance Metrics")
         print(table)
 
     display_overall_averages() # Display overall averages first
     display_task_scores()
-    display_zero_and_null_score_tasks()
+    display_zero_and_skipped_tasks()
     display_table("Average Success Score by Material", avg_material_scores)
     display_table("Average Success Score by Room", avg_room_scores)
     display_table("Average Success Score by (Material, Room) Tuples", avg_material_room_scores, tuple_keys=True)
 
 # Example usage
-folders = ["experiments/gpt-4o_construction_tasks", "experiments/exp_03-23_12-31"]
-model_names = ["GPT-4o","Claude 3.5 sonnet"]
+folders = ["experiments/gpt-4o_construction_tasks", "experiments/claude-3-5-sonnet-latest_construction_tasks"]
+model_names = ["GPT-4o", "Claude 3.5 sonnet"]
 extract_success_scores(folders, model_names)
\ No newline at end of file
diff --git a/analyze_cooking_tasks.py b/analyze_cooking_tasks.py
index d33be22..7575d3c 100644
--- a/analyze_cooking_tasks.py
+++ b/analyze_cooking_tasks.py
@@ -2,6 +2,7 @@ import os
 import json
 import re
 from collections import defaultdict
+from prettytable import PrettyTable
 
 def extract_cooking_items(exp_dir):
     """Extract cooking items from experiment directory name."""
@@ -36,8 +37,8 @@ def analyze_experiments(root_dir, model_name):
     # Keep track of all unique cooking items
     all_cooking_items = set()
 
-    # Track skipped experiments
-    skipped_experiments = []
+    # Keep track of ignored tasks
+    ignored_tasks = []
 
     # Get a list of all experiment directories
     experiment_dirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))
@@ -78,18 +79,18 @@ def analyze_experiments(root_dir, model_name):
                 with open(agent_file_path, 'r') as f:
                     agent_data = json.load(f)
 
-                # Check for success in the turns data
+                # Check for score information in the turns data
                 if "turns" in agent_data:
                     for turn in agent_data["turns"]:
                         if turn.get("role") == "system" and "content" in turn:
-                            if isinstance(turn["content"], str) and "Task ended with score" in turn["content"]:
+                            if isinstance(turn["content"], str) and "Task ended with score : " in turn["content"]:
                                 score_found = True
                                 if "Task ended with score : 1" in turn["content"]:
                                     is_successful = True
-                                    break
+                                break
 
-                # If we found score information, no need to check other files
-                if score_found:
+                # If we found success, no need to check other files
+                if is_successful:
                     break
 
             except (json.JSONDecodeError, IOError) as e:
@@ -97,10 +98,9 @@ def analyze_experiments(root_dir, model_name):
                 # Continue to check other agent files instead of failing
                 continue
 
-        # Skip experiments with no score information
+        # If no score information was found in any agent file, ignore this task
        if not score_found:
-            skipped_experiments.append(exp_dir)
-            print(f"Warning: No task score found in experiment {exp_dir} - skipping")
+            ignored_tasks.append(exp_dir)
             continue
 
         # Update cooking item results
@@ -114,178 +114,195 @@ def analyze_experiments(root_dir, model_name):
             if is_successful:
                 blocked_access_results[blocked_key]["success"] += 1
 
-    return blocked_access_results, cooking_item_results, all_cooking_items, skipped_experiments
+    # Print information about ignored tasks
+    if ignored_tasks:
+        print(f"\n{model_name}: Ignored {len(ignored_tasks)} tasks with no score information:")
+        for task in ignored_tasks:
+            print(f" - {task}")
+
+    return blocked_access_results, cooking_item_results, all_cooking_items, ignored_tasks
 
 def print_model_comparison_blocked(models_results):
     print("\nModel Comparison by Number of Agents with Blocked Access:")
     print("=" * 100)
-
+
     # Get all possible blocked access keys
     all_blocked_keys = set()
     for model_results in models_results.values():
         all_blocked_keys.update(model_results.keys())
-
+
     # Sort the keys
     sorted_keys = sorted(all_blocked_keys, key=lambda x: int(x.split()[0]))
-
-    # Create the header
-    header = f"{'Blocked Agents':<15} | "
-    for model_name in models_results.keys():
-        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
-    print(header)
-    print("-" * 100)
-
-    # Calculate and print the results for each blocked key
+
+    # Create the table
+    table = PrettyTable()
+    table.field_names = ["Blocked Agents"] + [
+        f"{model_name} (Success Rate | Success/Total)" for model_name in models_results.keys()
+    ]
+
+    # Calculate and add rows for each blocked key
     model_totals = {model: {"success": 0, "total": 0} for model in models_results.keys()}
-
+
     for key in sorted_keys:
-        row = f"{key:<15} | "
-
+        row = [key]
+
         for model_name, model_results in models_results.items():
             if key in model_results:
                 success = model_results[key]["success"]
                 total = model_results[key]["total"]
-
+
                 model_totals[model_name]["success"] += success
                 model_totals[model_name]["total"] += total
-
+
                 success_rate = (success / total * 100) if total > 0 else 0
-                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                row.append(f"{success_rate:.2f}% | {success}/{total}")
             else:
-                row += f"{'N/A':<19} | {'N/A':<19} | "
-
-        print(row)
-
+                row.append("N/A")
+
+        table.add_row(row)
+
+    # Print the table
+    print(table)
+
     # Print the overall results
-    print("-" * 100)
-    row = f"{'Overall':<15} | "
-
+    overall_row = ["Overall"]
     for model_name, totals in model_totals.items():
         success = totals["success"]
         total = totals["total"]
         success_rate = (success / total * 100) if total > 0 else 0
-        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
-
-    print(row)
+        overall_row.append(f"{success_rate:.2f}% | {success}/{total}")
+
+    table.add_row(overall_row)
+    print(table)
 
 def print_model_comparison_items(models_item_results, all_cooking_items):
     print("\nModel Comparison by Cooking Item:")
     print("=" * 100)
-
-    # Create the header
-    header = f"{'Cooking Item':<20} | "
-    for model_name in models_item_results.keys():
-        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
-    print(header)
-    print("-" * 100)
-
-    # Calculate and print the results for each cooking item
+
+    # Create the table
+    table = PrettyTable()
+    table.field_names = ["Cooking Item"] + [
+        f"{model_name} (Success Rate | Success/Total)" for model_name in models_item_results.keys()
+    ]
+
+    # Calculate and add rows for each cooking item
     model_totals = {model: {"success": 0, "total": 0} for model in models_item_results.keys()}
-
+
     for item in sorted(all_cooking_items):
-        row = f"{item:<20} | "
-
+        row = [item]
+
         for model_name, model_results in models_item_results.items():
             if item in model_results:
                 success = model_results[item]["success"]
                 total = model_results[item]["total"]
-
+
                 model_totals[model_name]["success"] += success
                 model_totals[model_name]["total"] += total
-
+
                 success_rate = (success / total * 100) if total > 0 else 0
-                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                row.append(f"{success_rate:.2f}% | {success}/{total}")
             else:
-                row += f"{'N/A':<19} | {'N/A':<19} | "
-
-        print(row)
-
+                row.append("N/A")
+
+        table.add_row(row)
+
+    # Print the table
+    print(table)
+
     # Print the overall results
-    print("-" * 100)
-    row = f"{'Overall':<20} | "
-
+    overall_row = ["Overall"]
     for model_name, totals in model_totals.items():
         success = totals["success"]
         total = totals["total"]
         success_rate = (success / total * 100) if total > 0 else 0
-        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
-
-    print(row)
+        overall_row.append(f"{success_rate:.2f}% | {success}/{total}")
+
+    table.add_row(overall_row)
+    print(table)
 
 def print_model_comparison_items_by_blocked(models_data, all_cooking_items):
     print("\nDetailed Model Comparison by Cooking Item and Blocked Agent Count:")
     print("=" * 120)
-
+
     # For each cooking item, create a comparison table by blocked agent count
     for item in sorted(all_cooking_items):
         print(f"\nResults for cooking item: {item}")
         print("-" * 100)
-
-        # Create the header
-        header = f"{'Blocked Agents':<15} | "
-        for model_name in models_data.keys():
-            header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
-        print(header)
-        print("-" * 100)
-
+
+        # Create the table
+        table = PrettyTable()
+        table.field_names = ["Blocked Agents"] + [
+            f"{model_name} Success Rate" for model_name in models_data.keys()
+        ] + [
+            f"{model_name} Success/Total" for model_name in models_data.keys()
+        ]
+
         # Get all possible blocked agent counts
         all_blocked_keys = set()
         for model_name, model_data in models_data.items():
             _, _, item_blocked_data = model_data
             for blocked_key in item_blocked_data.get(item, {}).keys():
                 all_blocked_keys.add(blocked_key)
-
+
         # Sort the keys
         sorted_keys = sorted(all_blocked_keys, key=lambda x: int(x.split()[0]))
-
-        # Print each row
+
+        # Add rows for each blocked key
         for blocked_key in sorted_keys:
-            row = f"{blocked_key:<15} | "
-
+            row = [blocked_key]
+
             for model_name, model_data in models_data.items():
                 _, _, item_blocked_data = model_data
-
+
                 if item in item_blocked_data and blocked_key in item_blocked_data[item]:
                     success = item_blocked_data[item][blocked_key]["success"]
                     total = item_blocked_data[item][blocked_key]["total"]
-
+
                     if total > 0:
                         success_rate = (success / total * 100)
-                        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                        row.append(f"{success_rate:.2f}%")
+                        row.append(f"{success}/{total}")
                     else:
-                        row += f"{'N/A':<19} | {'0/0':<19} | "
+                        row.append("N/A")
+                        row.append("0/0")
                 else:
-                    row += f"{'N/A':<19} | {'N/A':<19} | "
-
-            print(row)
-
+                    row.append("N/A")
+                    row.append("N/A")
+
+            table.add_row(row)
+
+        # Print the table
+        print(table)
+
         # Print item summary for each model
-        print("-" * 100)
-        row = f"{'Overall':<15} | "
-
+        overall_row = ["Overall"]
         for model_name, model_data in models_data.items():
             _, item_results, _ = model_data
-
+
             if item in item_results:
                 success = item_results[item]["success"]
                 total = item_results[item]["total"]
-
+
                 if total > 0:
                     success_rate = (success / total * 100)
-                    row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+                    overall_row.append(f"{success_rate:.2f}%")
+                    overall_row.append(f"{success}/{total}")
                 else:
-                    row += f"{'N/A':<19} | {'0/0':<19} | "
+                    overall_row.append("N/A")
+                    overall_row.append("0/0")
             else:
-                row += f"{'N/A':<19} | {'N/A':<19} | "
-
-        print(row)
+                overall_row.append("N/A")
+                overall_row.append("N/A")
+
+        table.add_row(overall_row)
+        print(table)
 
 def generate_item_blocked_data(experiments_root):
     # Organize data by item and blocked agent count
     item_blocked_data = defaultdict(lambda: defaultdict(lambda: {"success": 0, "total": 0}))
 
-    # Track skipped experiments
-    skipped_experiments = []
+    # Keep track of ignored tasks
+    ignored_tasks = []
 
     # Populate the data structure
     for exp_dir in os.listdir(experiments_root):
@@ -304,7 +321,7 @@ def generate_item_blocked_data(experiments_root):
         else:
             blocked_key = "0 agent(s)"
 
-        # Check if the task was successful
+        # Check if the task was successful and if score information exists
         is_successful = False
         score_found = False
         full_exp_path = os.path.join(experiments_root, exp_dir)
@@ -318,103 +335,90 @@ def generate_item_blocked_data(experiments_root):
                     if "turns" in agent_data:
                         for turn in agent_data["turns"]:
                             if turn.get("role") == "system" and "content" in turn:
-                                if isinstance(turn["content"], str) and "Task ended with score" in turn["content"]:
+                                if isinstance(turn["content"], str) and "Task ended with score : " in turn["content"]:
                                     score_found = True
                                     if "Task ended with score : 1" in turn["content"]:
                                         is_successful = True
-                                        break
+                                    break
 
-                if score_found:
+                if is_successful:
                     break
             except:
                 continue
 
-        # Skip experiments with no score information
+        # If no score information was found, skip this task
         if not score_found:
-            skipped_experiments.append(exp_dir)
+            ignored_tasks.append(exp_dir)
             continue
-
+
         # Update the item-blocked data
         for item in cooking_items:
             item_blocked_data[item][blocked_key]["total"] += 1
             if is_successful:
                 item_blocked_data[item][blocked_key]["success"] += 1
 
-    return item_blocked_data, skipped_experiments
+    return item_blocked_data, ignored_tasks
 
 def main():
-    base_dir = "experiments"
-
-    # Get the model directories
-    all_model_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
-    gpt_dirs = [d for d in all_model_dirs if d.startswith("gpt-4o_30_cooking_tasks")]
-    claude_dirs = [d for d in all_model_dirs if d.startswith("claude-3-5-sonnet-latest_30_cooking_tasks")]
-
-    if not gpt_dirs or not claude_dirs:
-        print("Error: Could not find both model directories. Please check your paths.")
+    # Define lists for model directories and corresponding model names
+    model_dirs = [
+        "experiments/gpt-4o_2agent_NEW_cooking_tasks",
+        # "experiments/claude-3-5-sonnet_2agent_NEW_cooking_tasks",
+        # "experiments/claude-3-5-sonnet_3agent_NEW_cooking_tasks",
+        "experiments/gpt-4o_3agent_NEW_cooking_tasks",
+        # "experiments/1_claude-3-5-sonnet_4agents_NEW_cooking_tasks",
+        "experiments/gpt-4o_4agents_NEW_cooking_tasks",
+        "experiments/gpt-4o_5agents_NEW_cooking_tasks",
+        # "experiments/"
+    ]
+    model_names = [
+        "GPT-4o-2agent",
+        # "Claude-3.5-2agent",
+        "GPT-4o-3agent",
+        # "Claude-3.5-3agent",
+        # "Claude-3.5-4agent",
+        "GPT-4o-4agent",
+        "GPT-4o-5agent",
+        # "Another-Model"
+    ]
+
+    # Ensure both lists are of the same size
+    if len(model_dirs) != len(model_names):
+        print("Error: The number of model directories and model names must be the same.")
         return
-
-    # Use the first directory found for each model
-    gpt_dir = os.path.join(base_dir, gpt_dirs[0])
-    claude_dir = os.path.join(base_dir, claude_dirs[0])
-
-    print(f"Analyzing GPT-4o experiments in: {gpt_dir}")
-    print(f"Analyzing Claude-3.5-Sonnet experiments in: {claude_dir}")
-
+
     # Analyze each model directory
-    gpt_blocked_results, gpt_item_results, gpt_unique_items, gpt_skipped = analyze_experiments(gpt_dir, "GPT-4o")
-    claude_blocked_results, claude_item_results, claude_unique_items, claude_skipped = analyze_experiments(claude_dir, "Claude-3.5")
-
-    # Combine unique cooking items
-    all_cooking_items = gpt_unique_items.union(claude_unique_items)
-
-    # Generate item-blocked data for each model
-    gpt_item_blocked_data, gpt_skipped_detailed = generate_item_blocked_data(gpt_dir)
-    claude_item_blocked_data, claude_skipped_detailed = generate_item_blocked_data(claude_dir)
-
-    # Create model comparison data structures
-    models_blocked_results = {
-        "GPT-4o": gpt_blocked_results,
-        "Claude-3.5": claude_blocked_results
-    }
-
-    models_item_results = {
-        "GPT-4o": gpt_item_results,
-        "Claude-3.5": claude_item_results
-    }
-
-    models_data = {
-        "GPT-4o": (gpt_blocked_results, gpt_item_results, gpt_item_blocked_data),
-        "Claude-3.5": (claude_blocked_results, claude_item_results, claude_item_blocked_data)
-    }
-
+    models_blocked_results = {}
+    models_item_results = {}
+    all_cooking_items = set()
+    total_ignored_tasks = 0
+
+    for model_dir, model_name in zip(model_dirs, model_names):
+        print(f"Analyzing {model_name} experiments in: {model_dir}")
+
+        blocked_results, item_results, unique_items, ignored_tasks = analyze_experiments(model_dir, model_name)
+
+        models_blocked_results[model_name] = blocked_results
+        models_item_results[model_name] = item_results
+        all_cooking_items.update(unique_items)
+        total_ignored_tasks += len(ignored_tasks)
+
+        if ignored_tasks:
+            print(f" - {model_name}: Ignored {len(ignored_tasks)} tasks with no score information.")
+
+    # Print summary of ignored tasks
+    if total_ignored_tasks > 0:
+        print(f"\nTotal ignored tasks (missing score information): {total_ignored_tasks}")
+
     # Print the comparison tables
     print_model_comparison_blocked(models_blocked_results)
     print_model_comparison_items(models_item_results, all_cooking_items)
-    print_model_comparison_items_by_blocked(models_data, all_cooking_items)
-
+
     # Print overall statistics
     print("\nUnique Cooking Items Found:")
     print("=" * 60)
     print(", ".join(sorted(all_cooking_items)))
     print(f"Total unique items: {len(all_cooking_items)}")
-
-    # Print skipped experiment information
-    print("\nSkipped Experiments (No Score Information):")
-    print("=" * 60)
-    print(f"GPT-4o: {len(gpt_skipped)} experiments skipped")
-    print(f"Claude-3.5: {len(claude_skipped)} experiments skipped")
-
-    if gpt_skipped or claude_skipped:
-        print("\nSkipped experiment directories:")
-        if gpt_skipped:
-            print("GPT-4o:")
-            for exp in gpt_skipped:
-                print(f" - {exp}")
-        if claude_skipped:
-            print("Claude-3.5:")
-            for exp in claude_skipped:
-                print(f" - {exp}")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
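
For reference, a minimal sketch of the score-extraction convention that both patched scripts rely on: each agent's JSON log is scanned for a system turn containing "Task ended with score : X", a task whose logs contain no such message is ignored/skipped, and a score of 1 counts as success. The helper name, the .json filename filter, and the flat one-folder-per-experiment layout below are illustrative assumptions, not part of the patch.

# Illustrative sketch only (not part of the patch); mirrors the convention above.
import json
import os

SCORE_MARKER = "Task ended with score : "

def read_task_outcome(exp_path):
    """Return (score_found, is_successful) for one experiment folder.

    Assumes each agent log is a JSON object with a "turns" list, as in the
    patched scripts; (False, _) means the task would be ignored/skipped.
    """
    score_found = False
    is_successful = False
    for name in os.listdir(exp_path):
        if not name.endswith(".json"):
            continue
        try:
            with open(os.path.join(exp_path, name), "r") as f:
                agent_data = json.load(f)
        except (json.JSONDecodeError, IOError):
            continue  # unreadable log: fall back to the other agents' files
        for turn in agent_data.get("turns", []):
            content = turn.get("content")
            if turn.get("role") == "system" and isinstance(content, str) and SCORE_MARKER in content:
                score_found = True
                if SCORE_MARKER + "1" in content:
                    is_successful = True
                break
        if is_successful:
            break
    return score_found, is_successful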