Updated analyze scripts to perform model comparison

2025-08-03 05:45:36 +02:00 · 2025-03-23 14:23:10 -07:00 · 2025-03-23 14:23:10 -07:00 · 76de807a46
commit 76de807a46
parent 4c5320eddb
2 changed files with 415 additions and 173 deletions
--- a/analyze_construction_tasks.py
+++ b/analyze_construction_tasks.py
@ -4,86 +4,170 @@ from collections import defaultdict
 from prettytable import PrettyTable
 import re

-def extract_success_scores(root_dir):
-    task_scores = {}  # Stores task-wise scores
-    material_groups = defaultdict(list)
-    room_groups = defaultdict(list)
+def extract_success_scores(folders, model_names):
+    """Compare construction-task success scores across models and print tables.
+
+    Every entry of ``folders`` is scanned for task sub-directories. Within a
+    task directory the JSON logs are read until one contains a system turn
+    with "Task ended with score"; the number after the last ":" of that turn
+    becomes the task's score for the matching entry of ``model_names``.
+    Tasks that have logs but no score are recorded as ``None`` ("null").
+
+    Printed tables: overall metrics, task-wise scores, tasks with 0 or null
+    score, and average scores grouped by material count, room count and
+    (material, room) pair. The grouped averages ignore null and 0 scores.
+
+    Args:
+        folders: Experiment root directories, one per model.
+        model_names: Display names, in the same order as ``folders``.
+
+    Raises:
+        ValueError: If ``folders`` and ``model_names`` differ in length.
+
+    Returns:
+        None. All results are printed.
+    """
+    # Raise rather than assert: assert statements are stripped under ``python -O``.
+    if len(folders) != len(model_names):
+        raise ValueError("Folders and model names lists must have the same length.")
+    
+    all_task_scores = defaultdict(dict)  # Stores task-wise scores per model
+    zero_score_tasks = defaultdict(list)  # Stores tasks with 0 score per model
+    null_score_tasks = defaultdict(list)  # Stores tasks with null score per model
+    material_groups = defaultdict(lambda: defaultdict(list))
+    room_groups = defaultdict(lambda: defaultdict(list))
+    material_room_groups = defaultdict(lambda: defaultdict(list))
+    overall_scores = defaultdict(list)  # New dict to store all scores for each model
    
-    # Regex pattern to extract material and room numbers
    pattern = re.compile(r"materials_(\d+)_rooms_(\d+)")
-
-    # Iterate through each task folder
-    for task_folder in os.listdir(root_dir):
-        task_path = os.path.join(root_dir, task_folder)
-        if os.path.isdir(task_path):
-            logs_found = False  # Flag to track if logs exist
-            
-            # Check for JSON files
-            for file_name in os.listdir(task_path):
-                if file_name.endswith(".json"): 
-                    logs_found = True  # JSON file exists
-                    file_path = os.path.join(task_path, file_name)
-                    
-                    # Read JSON file
-                    try:
-                        with open(file_path, 'r') as file:
-                            data = json.load(file)
-                            
-                            # Extract success score from the last system message
-                            for turn in reversed(data.get("turns", [])):
-                                if turn["role"] == "system" and "Task ended with score" in turn["content"]:
-                                    score = float(turn["content"].split(":")[-1].strip())
-                                    task_scores[task_folder] = score  # Store per-task score
-                                    break  # Stop searching if found
-                            
-                            # Stop checking other files in the folder if score is found
-                            if task_folder in task_scores:
+    
+    for root_dir, model_name in zip(folders, model_names):
+        for task_folder in os.listdir(root_dir):
+            task_path = os.path.join(root_dir, task_folder)
+            if os.path.isdir(task_path):
+                logs_found = False
+                score_found = False
+                
+                for file_name in os.listdir(task_path):
+                    if file_name.endswith(".json"): 
+                        logs_found = True
+                        file_path = os.path.join(task_path, file_name)
+                        
+                        try:
+                            with open(file_path, 'r') as file:
+                                data = json.load(file)
+                                
+                                # Walk the turns backwards so the last matching system turn wins.
+                                for turn in reversed(data.get("turns", [])):
+                                    if turn["role"] == "system" and "Task ended with score" in turn["content"]:
+                                        score = float(turn["content"].split(":")[-1].strip())
+                                        all_task_scores[task_folder][model_name] = score
+                                        overall_scores[model_name].append(score)  # Add to overall scores
+                                        score_found = True
+                                        
+                                        if score == 0:
+                                            zero_score_tasks[model_name].append(task_folder)
+                                        break 
+                                
+                            if score_found:
                                break 
-                    except Exception as e:
-                        print(f"Error reading {file_path}: {e}")
-            
-            # If no logs were found, print a message
-            if not logs_found:
-                print(f"No log files found in {task_folder}")
-
-    # Group scores by material and room
-    for task, score in task_scores.items():
+                        except Exception as e:
+                            print(f"Error reading {file_path}: {e}")
+                
+                if logs_found and not score_found:
+                    # Score not found but logs exist - mark as null
+                    all_task_scores[task_folder][model_name] = None
+                    null_score_tasks[model_name].append(task_folder)
+                
+                if not logs_found:
+                    print(f"No log files found in {task_folder}")
+    
+    # Calculate model completion rates (ignore null scores)
+    model_completion_rates = {}
+    for model_name in model_names:
+        valid_tasks = [task for task in all_task_scores.keys() if model_name in all_task_scores[task] and all_task_scores[task][model_name] is not None]
+        total_tasks = len(valid_tasks)
+        completed_tasks = len([task for task in valid_tasks if all_task_scores[task][model_name] > 0])
+        model_completion_rates[model_name] = (completed_tasks / total_tasks) if total_tasks > 0 else 0
+    
+    # Process task scores into groups (ignore null and 0 scores)
+    for task, model_scores in all_task_scores.items():
        match = pattern.search(task)
        if match:
-            material = int(match.group(1))  # Extract material number
-            room = int(match.group(2))  # Extract room number
-            material_groups[material].append(score)
-            room_groups[room].append(score)
-        else:
-            print(f"Warning: Task folder '{task}' does not match expected format.")
-
-    # Calculate average scores
+            material = int(match.group(1))
+            room = int(match.group(2))
+            
+            for model, score in model_scores.items():
+                if score is not None and score > 0:  # Ignore null and 0 scores
+                    material_groups[material][model].append(score)
+                    room_groups[room][model].append(score)
+                    material_room_groups[(material, room)][model].append(score)
+    
    def calculate_average(group):
-        return {key: sum(values) / len(values) for key, values in group.items()}
-
+        return {key: {model: sum(scores) / len(scores) for model, scores in models.items() if scores} 
+                for key, models in group.items() if models}
+    
    avg_material_scores = calculate_average(material_groups)
    avg_room_scores = calculate_average(room_groups)
-
-    # Display results using PrettyTable
-    def display_table(title, data):
-        table = PrettyTable(["Category", "Average Score"])
-        for key, value in sorted(data.items()):
-            table.add_row([key, round(value, 2)])
+    avg_material_room_scores = calculate_average(material_room_groups)
+    
+    def display_table(title, data, tuple_keys=False):
+        table = PrettyTable(["Category"] + model_names)
+        for key, model_scores in sorted(data.items()):
+            key_display = key if not tuple_keys else f"({key[0]}, {key[1]})"
+            # A model with no non-zero score in this group has no entry and is shown as 0.
+            row = [key_display] + [round(model_scores.get(model, 0), 2) for model in model_names]
+            table.add_row(row)
        print(f"\n{title}")
        print(table)
-
+    
    def display_task_scores():
-        table = PrettyTable(["Task", "Success Score"])
-        for task, score in sorted(task_scores.items()):
-            table.add_row([task, round(score, 2)])
+        table = PrettyTable(["Task"] + model_names)
+        for task in sorted(all_task_scores.keys()):
+            row = [task]
+            for model in model_names:
+                score = all_task_scores[task].get(model)
+                if score is None:
+                    row.append("null")
+                else:
+                    row.append(round(score, 2))
+            table.add_row(row)
        print("\nTask-wise Success Scores")
        print(table)
-
-    # Print all tables
+    
+    def display_zero_and_null_score_tasks():
+        for model in model_names:
+            if zero_score_tasks[model]:
+                table = PrettyTable([f"{model} - Tasks with 0 Score"])
+                for task in zero_score_tasks[model]:
+                    table.add_row([task])
+                print(f"\n{model} - Tasks with 0 Success Score")
+                print(table)
+            
+            if null_score_tasks[model]:
+                table = PrettyTable([f"{model} - Tasks with Null Score"])
+                for task in null_score_tasks[model]:
+                    table.add_row([task])
+                print(f"\n{model} - Tasks with Null Success Score")
+                print(table)
+    
+    def display_overall_averages():
+        table = PrettyTable(["Metric"] + model_names)
+        
+        # Overall average score (including zeros, excluding nulls)
+        row_with_zeros = ["Average Score (All Tasks)"]
+        for model in model_names:
+            valid_scores = [s for s in overall_scores[model] if s is not None]
+            avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0
+            row_with_zeros.append(round(avg, 2))
+        table.add_row(row_with_zeros)
+        
+        # Overall average score (excluding zeros and nulls)
+        row_without_zeros = ["Average Score (Completed Tasks)"]
+        for model in model_names:
+            completed_scores = [s for s in overall_scores[model] if s is not None and s > 0]
+            avg = sum(completed_scores) / len(completed_scores) if completed_scores else 0
+            row_without_zeros.append(round(avg, 2))
+        table.add_row(row_without_zeros)
+        
+        # Task completion rate
+        completion_row = ["Task Completion Rate (%)"]
+        for model in model_names:
+            completion_row.append(round(model_completion_rates[model] * 100, 2))
+        table.add_row(completion_row)
+        
+        # Total number of tasks (excluding nulls)
+        task_count_row = ["Total Tasks"]
+        for model in model_names:
+            valid_tasks = [task for task in all_task_scores.keys() if model in all_task_scores[task] and all_task_scores[task][model] is not None]
+            task_count_row.append(len(valid_tasks))
+        table.add_row(task_count_row)
+        
+        print("\nOverall Performance Metrics")
+        print(table)
+    
+    display_overall_averages()  # Display overall averages first
    display_task_scores()
-    display_table("Average Success Score by Material (Grouped by Number)", avg_material_scores)
-    display_table("Average Success Score by Room (Grouped by Number)", avg_room_scores)
+    display_zero_and_null_score_tasks()
+    display_table("Average Success Score by Material", avg_material_scores)
+    display_table("Average Success Score by Room", avg_room_scores)
+    display_table("Average Success Score by (Material, Room) Tuples", avg_material_room_scores, tuple_keys=True)

-# Example usage (replace 'root_directory' with actual path)
-root_directory = "experiments/exp_03-22_19-29"
-extract_success_scores(root_directory)
+# Example usage: folders[i] is the experiment directory for model_names[i].
+# This call sits at module level, so it runs whenever the file is executed or imported.
+folders = ["experiments/gpt-4o_construction_tasks", "experiments/exp_03-23_12-31"]
+model_names = ["GPT-4o","Claude 3.5 sonnet"]
+extract_success_scores(folders, model_names)
--- a/analyze_cooking_tasks.py
+++ b/analyze_cooking_tasks.py
@ -20,15 +20,11 @@ def extract_cooking_items(exp_dir):
    
    return items

-def analyze_experiments(root_dir):
+def analyze_experiments(root_dir, model_name):
    # Store results by number of blocked agents
    blocked_access_results = defaultdict(lambda: {
        "success": 0, 
-        "total": 0,
-        "cake_success": 0,
-        "cake_total": 0,
-        "non_cake_success": 0,
-        "non_cake_total": 0
+        "total": 0
    })
    
    # Store results by cooking item
@ -51,9 +47,6 @@ def analyze_experiments(root_dir):
        # Add to unique items set
        all_cooking_items.update(cooking_items)
        
-        # Check if experiment involves cake
-        has_cake = any(item == "cake" for item in cooking_items)
-        
        # Extract blocked access information from directory name
        blocked_access_match = re.search(r'blocked_access_([0-9_]+)$', exp_dir)
        
@ -104,119 +97,284 @@ def analyze_experiments(root_dir):
            if is_successful:
                cooking_item_results[item]["success"] += 1
        
-        # Update the appropriate blocked access counters
-        # First update the category-specific counters
-        if has_cake:
-            blocked_access_results[blocked_key]["cake_total"] += 1
-            if is_successful:
-                blocked_access_results[blocked_key]["cake_success"] += 1
-        else:
-            blocked_access_results[blocked_key]["non_cake_total"] += 1
-            if is_successful:
-                blocked_access_results[blocked_key]["non_cake_success"] += 1
-            
-            # Only count non-cake experiments in the main totals
-            blocked_access_results[blocked_key]["total"] += 1
-            if is_successful:
-                blocked_access_results[blocked_key]["success"] += 1
+        # Update the blocked access counters
+        blocked_access_results[blocked_key]["total"] += 1
+        if is_successful:
+            blocked_access_results[blocked_key]["success"] += 1
    
    return blocked_access_results, cooking_item_results, all_cooking_items

-def print_blocked_results(results):
-    print("\nExperiment Results by Number of Agents with Blocked Access (Excluding Cake Experiments):")
-    print("=" * 80)
-    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15} | {'Cake Tasks':<15} | {'Non-Cake Tasks':<15}")
-    print("-" * 80)
+def print_model_comparison_blocked(models_results):
+    """Print a per-model success table keyed by number of blocked agents.
+
+    Args:
+        models_results: Mapping of model name to that model's results, where
+            each result maps a blocked-access key to a dict with "success"
+            and "total" counts. Rows are ordered by the integer in the first
+            whitespace-separated token of the key.
+
+    A model without an entry for a key is shown as "N/A". The final
+    "Overall" row sums each model's counts over the keys it has.
+    """
+    print("\nModel Comparison by Number of Agents with Blocked Access:")
+    print("=" * 100)
    
-    # Calculate totals
-    total_success = 0
-    total_experiments = 0
-    total_cake = 0
-    total_non_cake = 0
+    # Get all possible blocked access keys
+    all_blocked_keys = set()
+    for model_results in models_results.values():
+        all_blocked_keys.update(model_results.keys())
    
-    # Sort by number of blocked agents
-    for key in sorted(results.keys(), key=lambda x: int(x.split()[0])):
-        success = results[key]["success"]
-        total = results[key]["total"]
-        cake_total = results[key]["cake_total"]
-        non_cake_total = results[key]["non_cake_total"]
+    # Sort the keys
+    sorted_keys = sorted(all_blocked_keys, key=lambda x: int(x.split()[0]))
+    
+    # Create the header
+    # NOTE(review): header cells are padded to 20 characters, but the data
+    # cells below are 19 wide and the success/total cell grows with the digit
+    # count, so the "|" separators may not line up - confirm if that matters.
+    header = f"{'Blocked Agents':<15} | "
+    for model_name in models_results.keys():
+        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
+    print(header)
+    print("-" * 100)
+    
+    # Calculate and print the results for each blocked key
+    model_totals = {model: {"success": 0, "total": 0} for model in models_results.keys()}
+    
+    for key in sorted_keys:
+        row = f"{key:<15} | "
        
-        # Verify that non_cake_total matches total
-        if non_cake_total != total:
-            print(f"Warning: Non-cake total ({non_cake_total}) doesn't match the total ({total}) for {key}")
-        
-        total_success += success
-        total_experiments += total
-        total_cake += cake_total
-        total_non_cake += non_cake_total
+        for model_name, model_results in models_results.items():
+            if key in model_results:
+                success = model_results[key]["success"]
+                total = model_results[key]["total"]
+                
+                model_totals[model_name]["success"] += success
+                model_totals[model_name]["total"] += total
+                
+                success_rate = (success / total * 100) if total > 0 else 0
+                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+            else:
+                row += f"{'N/A':<19} | {'N/A':<19} | "
        
+        print(row)
+    
+    # Print the overall results
+    print("-" * 100)
+    row = f"{'Overall':<15} | "
+    
+    for model_name, totals in model_totals.items():
+        success = totals["success"]
+        total = totals["total"]
        success_rate = (success / total * 100) if total > 0 else 0
-        
-        print(f"{key:<15} | {success_rate:>6.2f}%        | {success}/{total:<13} | {cake_total:<15} | {non_cake_total:<15}")
+        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
    
-    # Calculate overall success rate (excluding cake experiments)
-    overall_success_rate = (total_success / total_experiments * 100) if total_experiments > 0 else 0
-    
-    print("-" * 80)
-    print(f"{'Overall':<15} | {overall_success_rate:>6.2f}%        | {total_success}/{total_experiments:<13} | {total_cake:<15} | {total_non_cake:<15}")
-    
-    # Print cake experiment details
-    print("\nCake Experiment Details:")
-    print("=" * 60)
-    print(f"{'Blocked Agents':<15} | {'Success Rate':<15} | {'Success/Total':<15}")
-    print("-" * 60)
-    
-    cake_total_success = 0
-    cake_total_experiments = 0
-    
-    for key in sorted(results.keys(), key=lambda x: int(x.split()[0])):
-        cake_success = results[key]["cake_success"]
-        cake_total = results[key]["cake_total"]
-        
-        cake_total_success += cake_success
-        cake_total_experiments += cake_total
-        
-        cake_success_rate = (cake_success / cake_total * 100) if cake_total > 0 else 0
-        
-        print(f"{key:<15} | {cake_success_rate:>6.2f}%        | {cake_success}/{cake_total}")
-    
-    cake_overall_success_rate = (cake_total_success / cake_total_experiments * 100) if cake_total_experiments > 0 else 0
-    
-    print("-" * 60)
-    print(f"{'Overall':<15} | {cake_overall_success_rate:>6.2f}%        | {cake_total_success}/{cake_total_experiments}")
+    print(row)

-def print_cooking_items(cooking_items):
-    print("\nUnique Cooking Items Found:")
-    print("=" * 60)
-    print(", ".join(sorted(cooking_items)))
-    print(f"Total unique items: {len(cooking_items)}")
-
-def print_item_results(item_results):
-    print("\nExperiment Results by Cooking Item:")
-    print("=" * 60)
-    print(f"{'Cooking Item':<20} | {'Success Rate':<15} | {'Success/Total':<15}")
-    print("-" * 60)
+def print_model_comparison_items(models_item_results, all_cooking_items):
+    """Print a per-model success table with one row per cooking item.
+
+    Args:
+        models_item_results: Mapping of model name to that model's results,
+            where each result maps a cooking item to a dict with "success"
+            and "total" counts.
+        all_cooking_items: Iterable of cooking item names; rows are printed
+            in sorted order.
+
+    A model without an entry for an item is shown as "N/A". The final
+    "Overall" row sums each model's per-item counts.
+    """
+    print("\nModel Comparison by Cooking Item:")
+    print("=" * 100)
    
-    # Sort by item name
-    for item in sorted(item_results.keys()):
-        success = item_results[item]["success"]
-        total = item_results[item]["total"]
+    # Create the header
+    header = f"{'Cooking Item':<20} | "
+    for model_name in models_item_results.keys():
+        header += f"{model_name+' Success Rate':<20} | {model_name+' Success/Total':<20} | "
+    print(header)
+    print("-" * 100)
+    
+    # Calculate and print the results for each cooking item
+    # NOTE(review): these totals add up per-item counts, so an experiment with
+    # several cooking items would presumably be counted once per item in the
+    # "Overall" row - verify against how analyze_experiments fills the counts.
+    model_totals = {model: {"success": 0, "total": 0} for model in models_item_results.keys()}
+    
+    for item in sorted(all_cooking_items):
+        row = f"{item:<20} | "
+        
+        for model_name, model_results in models_item_results.items():
+            if item in model_results:
+                success = model_results[item]["success"]
+                total = model_results[item]["total"]
+                
+                model_totals[model_name]["success"] += success
+                model_totals[model_name]["total"] += total
+                
+                success_rate = (success / total * 100) if total > 0 else 0
+                row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+            else:
+                row += f"{'N/A':<19} | {'N/A':<19} | "
+        
+        print(row)
+    
+    # Print the overall results
+    print("-" * 100)
+    row = f"{'Overall':<20} | "
+    
+    for model_name, totals in model_totals.items():
+        success = totals["success"]
+        total = totals["total"]
        success_rate = (success / total * 100) if total > 0 else 0
-        
-        print(f"{item:<20} | {success_rate:>6.2f}%        | {success}/{total}")
+        row += f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
    
-    print("-" * 60)
+    print(row)
+
+def print_model_comparison_items_by_blocked(models_data, all_cooking_items):
+    """Print, for every cooking item, a per-model table split by blocked-agent count.
+
+    Args:
+        models_data: Mapping of model name to a 3-tuple. The second element
+            maps item -> {"success", "total"}; the third maps
+            item -> blocked key -> {"success", "total"}. The first element
+            is not used here.
+        all_cooking_items: Iterable of cooking item names; one table is
+            printed per item, in sorted order.
+    """
+    def render_cells(stats):
+        # Build the two cells (success rate, success/total) for one model.
+        if stats is None:
+            return f"{'N/A':<19} | {'N/A':<19} | "
+        success = stats["success"]
+        total = stats["total"]
+        if total > 0:
+            success_rate = (success / total * 100)
+            return f"{success_rate:>6.2f}%{'':<12} | {success}/{total}{'':<12} | "
+        return f"{'N/A':<19} | {'0/0':<19} | "
+
+    rule = "-" * 100
+
+    print("\nDetailed Model Comparison by Cooking Item and Blocked Agent Count:")
+    print("=" * 120)
+
+    for item in sorted(all_cooking_items):
+        print(f"\nResults for cooking item: {item}")
+        print(rule)
+
+        header_cells = [
+            f"{name+' Success Rate':<20} | {name+' Success/Total':<20} | "
+            for name in models_data
+        ]
+        print(f"{'Blocked Agents':<15} | " + "".join(header_cells))
+        print(rule)
+
+        # Every blocked-agent key that any model recorded for this item.
+        blocked_keys = {
+            key
+            for _, _, by_blocked in models_data.values()
+            for key in by_blocked.get(item, {})
+        }
+
+        for blocked_key in sorted(blocked_keys, key=lambda k: int(k.split()[0])):
+            cells = []
+            for _, _, by_blocked in models_data.values():
+                per_item = by_blocked.get(item, {})
+                stats = per_item[blocked_key] if blocked_key in per_item else None
+                cells.append(render_cells(stats))
+            print(f"{blocked_key:<15} | " + "".join(cells))
+
+        # Per-item summary row, taken from each model's item-level results.
+        print(rule)
+        summary_cells = []
+        for _, item_results, _ in models_data.values():
+            stats = item_results[item] if item in item_results else None
+            summary_cells.append(render_cells(stats))
+        print(f"{'Overall':<15} | " + "".join(summary_cells))
+
+def generate_item_blocked_data(experiments_root):
+    """Tally cooking-experiment outcomes per cooking item and blocked-agent count.
+
+    Only sub-directories of ``experiments_root`` whose name starts with
+    "multiagent_cooking_" are considered. An experiment counts as successful
+    when any of its JSON logs has a system turn whose content contains
+    "Task ended with score : 1".
+
+    Args:
+        experiments_root: Directory containing one sub-directory per experiment.
+
+    Returns:
+        Nested defaultdict: item -> blocked key (e.g. "2 agent(s)") ->
+        {"success": int, "total": int}.
+    """
+    # Organize data by item and blocked agent count
+    item_blocked_data = defaultdict(lambda: defaultdict(lambda: {"success": 0, "total": 0}))
+
+    # Populate the data structure
+    for exp_dir in os.listdir(experiments_root):
+        full_exp_path = os.path.join(experiments_root, exp_dir)
+        if not os.path.isdir(full_exp_path) or not exp_dir.startswith("multiagent_cooking_"):
+            continue
+
+        # Extract cooking items
+        cooking_items = extract_cooking_items(exp_dir)
+
+        # The blocked-agent count is the number of "_"-separated fields in the
+        # trailing "blocked_access_<...>" suffix; no suffix means 0 agents.
+        blocked_access_match = re.search(r'blocked_access_([0-9_]+)$', exp_dir)
+        if blocked_access_match:
+            blocked_access_str = blocked_access_match.group(1)
+            num_blocked_agents = len(blocked_access_str.split('_'))
+            blocked_key = f"{num_blocked_agents} agent(s)"
+        else:
+            blocked_key = "0 agent(s)"
+
+        # Check if the task was successful
+        is_successful = False
+        agent_files = [f for f in os.listdir(full_exp_path) if f.endswith(".json")]
+
+        for agent_file in agent_files:
+            agent_path = os.path.join(full_exp_path, agent_file)
+            try:
+                with open(agent_path, 'r') as f:
+                    agent_data = json.load(f)
+
+                if "turns" in agent_data:
+                    for turn in agent_data["turns"]:
+                        if turn.get("role") == "system" and "content" in turn:
+                            if isinstance(turn["content"], str) and "Task ended with score : 1" in turn["content"]:
+                                is_successful = True
+                                break
+            except Exception as e:
+                # Stay best-effort (skip unreadable or malformed logs) but
+                # report the problem, and let KeyboardInterrupt/SystemExit
+                # propagate - a bare ``except:`` would swallow those too.
+                print(f"Error reading {agent_path}: {e}")
+                continue
+
+            if is_successful:
+                break
+
+        # Update the item-blocked data
+        for item in cooking_items:
+            item_blocked_data[item][blocked_key]["total"] += 1
+            if is_successful:
+                item_blocked_data[item][blocked_key]["success"] += 1
+
+    return item_blocked_data

 def main():
-    # Update this path to your experiments directory
-    experiments_root = "../results/llama_70b_hells_kitchen_cooking_tasks"
+    """Compare cooking-task results of two models found under ``experiments``.
+
+    Picks one directory per model by name prefix, analyzes both, and prints
+    the blocked-access, per-item and per-item-by-blocked comparison tables
+    followed by the list of unique cooking items.
+    """
+    base_dir = "experiments"
    
-    print(f"Analyzing experiments in: {os.path.abspath(experiments_root)}")
-    blocked_results, item_results, unique_items = analyze_experiments(experiments_root)
+    # Get the model directories
+    all_model_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
+    gpt_dirs = [d for d in all_model_dirs if d.startswith("gpt-4o_30_cooking_tasks")]
+    # NOTE(review): this filter matches "llama_70b_..." directories, yet the
+    # result is labelled Claude-3.5 everywhere below - confirm which is intended.
+    claude_dirs = [d for d in all_model_dirs if d.startswith("llama_70b_30_cooking_tasks")]
    
-    print_blocked_results(blocked_results)
-    print_cooking_items(unique_items)
-    print_item_results(item_results)
+    if not gpt_dirs or not claude_dirs:
+        print("Error: Could not find both model directories. Please check your paths.")
+        return
+    
+    # Use the first directory found for each model
+    # NOTE(review): os.listdir returns entries in arbitrary order, so "first"
+    # is not deterministic when several directories share the prefix.
+    gpt_dir = os.path.join(base_dir, gpt_dirs[0])
+    claude_dir = os.path.join(base_dir, claude_dirs[0])
+    
+    print(f"Analyzing GPT-4o experiments in: {gpt_dir}")
+    print(f"Analyzing Claude-3.5-Sonnet experiments in: {claude_dir}")
+    
+    # Analyze each model directory
+    gpt_blocked_results, gpt_item_results, gpt_unique_items = analyze_experiments(gpt_dir, "GPT-4o")
+    claude_blocked_results, claude_item_results, claude_unique_items = analyze_experiments(claude_dir, "Claude-3.5")
+    
+    # Combine unique cooking items
+    all_cooking_items = gpt_unique_items.union(claude_unique_items)
+    
+    # Generate item-blocked data for each model
+    gpt_item_blocked_data = generate_item_blocked_data(gpt_dir)
+    claude_item_blocked_data = generate_item_blocked_data(claude_dir)
+    
+    # Create model comparison data structures
+    models_blocked_results = {
+        "GPT-4o": gpt_blocked_results,
+        "Claude-3.5": claude_blocked_results
+    }
+    
+    models_item_results = {
+        "GPT-4o": gpt_item_results,
+        "Claude-3.5": claude_item_results
+    }
+    
+    # Each value is (blocked results, item results, item-by-blocked results).
+    models_data = {
+        "GPT-4o": (gpt_blocked_results, gpt_item_results, gpt_item_blocked_data),
+        "Claude-3.5": (claude_blocked_results, claude_item_results, claude_item_blocked_data)
+    }
+    
+    # Print the comparison tables
+    print_model_comparison_blocked(models_blocked_results)
+    print_model_comparison_items(models_item_results, all_cooking_items)
+    print_model_comparison_items_by_blocked(models_data, all_cooking_items)
+    
+    # Print overall statistics
+    print("\nUnique Cooking Items Found:")
+    print("=" * 60)
+    print(", ".join(sorted(all_cooking_items)))
+    print(f"Total unique items: {len(all_cooking_items)}")

 if __name__ == "__main__":
    main()