Mirror of https://github.com/kolbytn/mindcraft.git, synced 2025-08-04 14:25:43 +02:00
Added files for filtering easy test tasks (30) and training tasks (500)
parent 9c674e6019
commit 4c5320eddb
2 changed files with 328 additions and 0 deletions
tasks/construction_tasks/filter_easy_tasks.py (new file, 241 lines)
@@ -0,0 +1,241 @@
import json
import re
import statistics
import random
import os


def extract_difficulty(task_name):
    """Extract difficulty parameters from the task name."""
    match = re.search(r'materials_(\d+)_rooms_(\d+)_window_(\d+)_carpet_(\d+)_variant_\d+', task_name)
    if match:
        return tuple(map(int, match.groups()))  # (m, r, w, c)
    return (0, 0, 0, 0)  # Default to lowest difficulty if not found


def calculate_difficulty_score(task_name, task, alpha=1.0, beta=3.0):
    """Compute a difficulty score based on parameters."""
    m, r, w, c = extract_difficulty(task_name)
    num_levels = len(task.get("blueprint", {}).get("levels", []))

    # Higher values mean more difficulty
    score = (m*4 + r*10 + w*2 + c*1)
    return score


def process_json(file_path, output_path, alpha=1.0, beta=3.0):
    """Process the JSON file to count tasks, quantify difficulty, and filter the easiest 30."""
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Count total tasks
    total_tasks = len(data)
    print(f"Total tasks: {total_tasks}")

    # Compute difficulty scores for tasks with at least 3 levels
    task_difficulties = []
    filtered_out = 0

    for task_name, task_details in data.items():
        num_levels = len(task_details.get("blueprint", {}).get("levels", []))

        # Skip tasks with fewer than 3 levels
        if num_levels < 3:
            filtered_out += 1
            continue

        score = calculate_difficulty_score(task_name, task_details, alpha, beta)
        task_difficulties.append((task_name, score, task_details))

    print(f"Filtered out {filtered_out} tasks with fewer than 3 levels")
    print(f"Remaining tasks after filtering: {len(task_difficulties)}")

    # Calculate statistics on the filtered tasks
    if task_difficulties:
        difficulty_scores = [score for _, score, _ in task_difficulties]
        stats = {
            "mean": statistics.mean(difficulty_scores),
            "median": statistics.median(difficulty_scores),
            "min": min(difficulty_scores),
            "max": max(difficulty_scores),
        }
        print(f"Difficulty Statistics for Overall Tasks: {stats}")
    else:
        stats = {"mean": 0, "median": 0, "min": 0, "max": 0}
        print("No tasks remaining after filtering!")

    # Sort tasks by difficulty (ascending)
    task_difficulties.sort(key=lambda x: x[1])

    # Get the 30 easiest tasks (or all if fewer than 30)
    num_tasks_to_select = min(30, len(task_difficulties))
    easiest_tasks = {task[0]: task[2] for task in task_difficulties[:num_tasks_to_select]}

    # Difficulty scores of the easiest tasks
    easiest_difficulty_scores = [score for _, score, _ in task_difficulties[:num_tasks_to_select]]
    easiest_stats = {
        "mean": statistics.mean(easiest_difficulty_scores),
        "median": statistics.median(easiest_difficulty_scores),
        "min": min(easiest_difficulty_scores),
        "max": max(easiest_difficulty_scores),
    }
    print(f"Difficulty Statistics for Easiest Tasks: {easiest_stats}")

    # Group the easiest tasks by their unique (m, r, w, c) combinations
    unique_difficulties = {}
    for task_name, _, task_details in task_difficulties[:num_tasks_to_select]:
        m, r, w, c = extract_difficulty(task_name)
        unique_difficulties[(m, r, w, c)] = unique_difficulties.get((m, r, w, c), 0) + 1

    print("Unique (m, r, w, c) combinations in the easiest tasks:")
    for difficulty, count in unique_difficulties.items():
        print(f"  {difficulty}: {count} tasks")

    # Add statistics to output
    output_data = easiest_tasks

    # Save to output file
    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=4)

    print(f"Saved {num_tasks_to_select} easiest tasks with statistics to {output_path}")


def sample_tasks_with_distribution(file_path, output_path):
    """
    Sample tasks with a specific distribution:
    - 3 tasks for each of the 9 possibilities of (m,r) where 0 <= m <= 2 and 0 <= r <= 2
    - Random (w,c) between 0 and 1 for the above tasks
    - 2 additional tasks from (m,r,w,c) = (0,0,0,0)
    - 1 additional task from (m,r,w,c) = (1,0,0,0)
    """
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Filter tasks with at least 3 levels
    valid_tasks = {}
    for task_name, task_details in data.items():
        num_levels = len(task_details.get("blueprint", {}).get("levels", []))
        if num_levels >= 3:
            valid_tasks[task_name] = task_details

    # print(f"Total valid tasks: {len(valid_tasks)}")

    # Categorize tasks by their (m,r,w,c) values
    tasks_by_params = {}
    for task_name, task_details in valid_tasks.items():
        m, r, w, c = extract_difficulty(task_name)
        key = (m, r, w, c)
        if key not in tasks_by_params:
            tasks_by_params[key] = []
        tasks_by_params[key].append((task_name, task_details))

    # # Print available combinations
    # print("Available (m,r,w,c) combinations:")
    # for params, tasks in tasks_by_params.items():
    #     print(f"  {params}: {len(tasks)} tasks")

    # Sample tasks according to the distribution
    sampled_tasks = {}
    already_sampled = set()

    # 1. Sample 3 tasks for each (m,r) where 0 <= m <= 2 and 0 <= r <= 2
    for m in range(3):
        for r in range(3):
            # Find all tasks with the current (m,r) and w,c between 0 and 1
            candidates = []
            for params, tasks in tasks_by_params.items():
                if params[0] == m and params[1] == r and params[2] <= 1 and params[3] <= 1:
                    candidates.extend(tasks)

            # Sample 3 tasks if possible
            if len(candidates) >= 3:
                sampled = random.sample(candidates, 3)
                for task_name, task_details in sampled:
                    if task_name not in already_sampled:
                        sampled_tasks[task_name] = task_details
                        already_sampled.add(task_name)
            else:
                print(f"Warning: Not enough tasks for (m={m}, r={r}) with w,c <= 1. Found {len(candidates)}.")
                # Add all available
                for task_name, task_details in candidates:
                    if task_name not in already_sampled:
                        sampled_tasks[task_name] = task_details
                        already_sampled.add(task_name)

    # 2. Add 2 tasks with (m,r,w,c) = (0,0,0,0)
    zero_zero_zero_zero = tasks_by_params.get((0,0,0,0), [])
    zero_zero_zero_zero = [t for t in zero_zero_zero_zero if t[0] not in already_sampled]

    if len(zero_zero_zero_zero) >= 2:
        additional = random.sample(zero_zero_zero_zero, 2)
        for task_name, task_details in additional:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)
    else:
        print(f"Warning: Not enough tasks for (0,0,0,0). Found {len(zero_zero_zero_zero)}.")
        for task_name, task_details in zero_zero_zero_zero:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)

    # 3. Add 1 task with (m,r,w,c) = (1,0,0,0)
    one_zero_zero_zero = tasks_by_params.get((1,0,0,0), [])
    one_zero_zero_zero = [t for t in one_zero_zero_zero if t[0] not in already_sampled]

    if len(one_zero_zero_zero) >= 1:
        additional = random.sample(one_zero_zero_zero, 1)
        for task_name, task_details in additional:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)
    else:
        print(f"Warning: Not enough tasks for (1,0,0,0). Found {len(one_zero_zero_zero)}.")
        for task_name, task_details in one_zero_zero_zero:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)

    # Print summary of sampled tasks
    print(f"\nTotal sampled tasks: {len(sampled_tasks)}")

    # Count tasks by their (m,r) values
    distribution = {}
    for task_name in sampled_tasks:
        m, r, w, c = extract_difficulty(task_name)
        key = (m, r)
        if key not in distribution:
            distribution[key] = []
        distribution[key].append((w, c))

    print("\nDistribution of sampled tasks:")
    for mr, wc_list in distribution.items():
        print(f"  (m={mr[0]}, r={mr[1]}): {len(wc_list)} tasks")
        for wc in wc_list:
            print(f"    (w={wc[0]}, c={wc[1]})")

    # Check for duplicates in sampled tasks
    if len(sampled_tasks) != len(set(sampled_tasks.keys())):
        print("\nWARNING: Duplicate tasks detected!")

        # Find the duplicates
        task_counts = {}
        for task_name in sampled_tasks.keys():
            task_counts[task_name] = task_counts.get(task_name, 0) + 1

        duplicates = [task for task, count in task_counts.items() if count > 1]
        print(f"Duplicate tasks: {duplicates}")
    else:
        print("\nVerification: No duplicates found in the sampled tasks.")

    # Save to output file
    with open(output_path, 'w') as f:
        json.dump(sampled_tasks, f, indent=4)

    print(f"\nSaved {len(sampled_tasks)} distributed tasks to {output_path}")


# Example usage:
# process_json('test/2agents.json', 'test/2_agents_easiest_tasks.json', alpha=1.0, beta=3.0)

# Iterate through files in the tasks folder
tasks_dir = 'test'
for filename in os.listdir(tasks_dir):
    if filename.endswith('agents.json'):
        input_path = os.path.join(tasks_dir, filename)
        # Create output filename by replacing .json with _distributed_tasks.json
        output_filename = filename.replace('.json', '_distributed_tasks.json')
        output_path = os.path.join(tasks_dir, output_filename)
        print(f"\nProcessing {filename}...")
        sample_tasks_with_distribution(input_path, output_path)
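The sampling distribution in the docstring of sample_tasks_with_distribution accounts for exactly the 30 easy test tasks named in the commit message: 9 (m, r) cells × 3 tasks each, plus 2 extra (0,0,0,0) tasks and 1 extra (1,0,0,0) task. To make the name parsing and scoring concrete, here is a minimal standalone sketch; the task name is hypothetical, but the regex and weights mirror extract_difficulty and calculate_difficulty_score:

# Standalone sketch of the parsing and scoring used above.
# The task name here is hypothetical; real names are the keys of the task JSON files.
import re

name = "church_materials_2_rooms_1_window_0_carpet_1_variant_3"
m, r, w, c = map(int, re.search(
    r'materials_(\d+)_rooms_(\d+)_window_(\d+)_carpet_(\d+)_variant_\d+',
    name).groups())
score = m*4 + r*10 + w*2 + c*1  # 2*4 + 1*10 + 0*2 + 1*1 = 19
print((m, r, w, c), score)      # (2, 1, 0, 1) 19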
tasks/construction_tasks/filter_training_tasks.py (new file, 87 lines)
@@ -0,0 +1,87 @@
import json
import re
import random
import os
from collections import defaultdict


def extract_difficulty(task_name):
    """Extract difficulty parameters from the task name."""
    match = re.search(r'materials_(\d+)_rooms_(\d+)_window_(\d+)_carpet_(\d+)_variant_\d+', task_name)
    if match:
        return tuple(map(int, match.groups()))  # (m, r, w, c)
    return (0, 0, 0, 0)  # Default if not found


def filter_and_sample_tasks(file_path, output_path):
    """Filters, samples, and saves 500 unique tasks based on given criteria."""
    with open(file_path, 'r') as f:
        data = json.load(f)

    total_tasks = len(data)
    print(f"\nProcessing file: {file_path}")
    print(f"Total available tasks: {total_tasks}")

    valid_tasks = {}

    # Filter tasks with at least 3 levels
    for task_name, task_details in data.items():
        num_levels = len(task_details.get("blueprint", {}).get("levels", []))
        if num_levels >= 3:
            valid_tasks[task_name] = task_details

    print(f"Tasks with at least 3 levels: {len(valid_tasks)}")

    # Organize tasks by difficulty parameters (m, r, w, c)
    tasks_by_params = defaultdict(list)
    for task_name, task_details in valid_tasks.items():
        key = extract_difficulty(task_name)
        tasks_by_params[key].append((task_name, task_details))

    # Sort keys in increasing order
    sorted_keys = sorted(tasks_by_params.keys())
    sampled_tasks = {}
    total_selected = 0
    sampled_task_counts = defaultdict(int)

    # Pick tasks sequentially until 500 are collected
    for key in sorted_keys:
        if total_selected >= 500:
            break

        if key in tasks_by_params:
            candidates = tasks_by_params[key]
            for task_name, task_details in candidates:
                if total_selected < 500:
                    sampled_tasks[task_name] = task_details
                    sampled_task_counts[key] += 1  # Keep the key as a tuple
                    total_selected += 1
                else:
                    break

    print(f"\nTotal sampled tasks: {len(sampled_tasks)}")

    # Print task count per (m, r, w, c) tuple
    print("\nTask count per (m, r, w, c):")
    for key, count in sorted(sampled_task_counts.items()):
        print(f"{key}: {count}")

    # Randomly shuffle the tasks before saving
    shuffled_tasks = list(sampled_tasks.items())
    random.shuffle(shuffled_tasks)
    final_tasks = dict(shuffled_tasks)

    # Save sampled tasks to JSON
    with open(output_path, 'w') as f:
        json.dump(final_tasks, f, indent=4)

    print(f"\nSaved {len(final_tasks)} tasks to {output_path}")


# Process all relevant files
tasks_dir = 'train'
all_filenames = [f for f in os.listdir(tasks_dir) if f.endswith('agents.json')]
all_filenames.sort()

for i, filename in enumerate(all_filenames):
    input_path = os.path.join(tasks_dir, filename)
    output_filename = filename.replace('.json', '_sampled_tasks_for_training.json')
    output_path = os.path.join(tasks_dir, output_filename)
    filter_and_sample_tasks(input_path, output_path)