Mirror of https://github.com/kolbytn/mindcraft.git, synced 2025-08-04 14:25:43 +02:00
Added files for filtering easy test tasks (30) and training tasks (500)
parent 9c674e6019
commit 4c5320eddb
2 changed files with 328 additions and 0 deletions
tasks/construction_tasks/filter_easy_tasks.py (new file, 241 lines)
@@ -0,0 +1,241 @@
import json
import re
import statistics
import random
import os


def extract_difficulty(task_name):
    """Extract difficulty parameters from the task name."""
    match = re.search(r'materials_(\d+)_rooms_(\d+)_window_(\d+)_carpet_(\d+)_variant_\d+', task_name)
    if match:
        return tuple(map(int, match.groups()))  # (m, r, w, c)
    return (0, 0, 0, 0)  # Default to lowest difficulty if not found


def calculate_difficulty_score(task_name, task, alpha=1.0, beta=3.0):
    """Compute a difficulty score based on parameters."""
    m, r, w, c = extract_difficulty(task_name)
    num_levels = len(task.get("blueprint", {}).get("levels", []))

    # Higher values mean more difficulty
    score = (m*4 + r*10 + w*2 + c*1)
    return score


def process_json(file_path, output_path, alpha=1.0, beta=3.0):
    """Process the JSON file to count tasks, quantify difficulty, and filter the easiest 30."""
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Count total tasks
    total_tasks = len(data)
    print(f"Total tasks: {total_tasks}")

    # Compute difficulty scores for tasks with at least 3 levels
    task_difficulties = []
    filtered_out = 0

    for task_name, task_details in data.items():
        num_levels = len(task_details.get("blueprint", {}).get("levels", []))

        # Skip tasks with fewer than 3 levels
        if num_levels < 3:
            filtered_out += 1
            continue

        score = calculate_difficulty_score(task_name, task_details, alpha, beta)
        task_difficulties.append((task_name, score, task_details))

    print(f"Filtered out {filtered_out} tasks with fewer than 3 levels")
    print(f"Remaining tasks after filtering: {len(task_difficulties)}")

    # Calculate statistics on the filtered tasks
    if task_difficulties:
        difficulty_scores = [score for _, score, _ in task_difficulties]
        stats = {
            "mean": statistics.mean(difficulty_scores),
            "median": statistics.median(difficulty_scores),
            "min": min(difficulty_scores),
            "max": max(difficulty_scores),
        }
        print(f"Difficulty Statistics for Overall Tasks: {stats}")
    else:
        stats = {"mean": 0, "median": 0, "min": 0, "max": 0}
        print("No tasks remaining after filtering!")

    # Sort tasks by difficulty (ascending)
    task_difficulties.sort(key=lambda x: x[1])

    # Get the 30 easiest tasks (or all if fewer than 30)
    num_tasks_to_select = min(30, len(task_difficulties))
    easiest_tasks = {task[0]: task[2] for task in task_difficulties[:num_tasks_to_select]}

    # Difficulty scores of the easiest tasks
    easiest_difficulty_scores = [score for _, score, _ in task_difficulties[:num_tasks_to_select]]
    easiest_stats = {
        "mean": statistics.mean(easiest_difficulty_scores),
        "median": statistics.median(easiest_difficulty_scores),
        "min": min(easiest_difficulty_scores),
        "max": max(easiest_difficulty_scores),
    }
    print(f"Difficulty Statistics for Easiest Tasks: {easiest_stats}")

    # Group the easiest tasks by their unique (m, r, w, c) combinations
    unique_difficulties = {}
    for task_name, _, task_details in task_difficulties[:num_tasks_to_select]:
        m, r, w, c = extract_difficulty(task_name)
        unique_difficulties[(m, r, w, c)] = unique_difficulties.get((m, r, w, c), 0) + 1

    print("Unique (m, r, w, c) combinations in the easiest tasks:")
    for difficulty, count in unique_difficulties.items():
        print(f"  {difficulty}: {count} tasks")

    # Add statistics to output
    output_data = easiest_tasks

    # Save to output file
    with open(output_path, 'w') as f:
        json.dump(output_data, f, indent=4)

    print(f"Saved {num_tasks_to_select} easiest tasks with statistics to {output_path}")


def sample_tasks_with_distribution(file_path, output_path):
    """
    Sample tasks with a specific distribution:
    - 3 tasks for each of the 9 possibilities of (m,r) where 0 <= m <= 2 and 0 <= r <= 2
    - Random (w,c) between 0 and 1 for the above tasks
    - 2 additional tasks from (m,r,w,c) = (0,0,0,0)
    - 1 additional task from (m,r,w,c) = (1,0,0,0)
    """
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Filter tasks with at least 3 levels
    valid_tasks = {}
    for task_name, task_details in data.items():
        num_levels = len(task_details.get("blueprint", {}).get("levels", []))
        if num_levels >= 3:
            valid_tasks[task_name] = task_details

    # print(f"Total valid tasks: {len(valid_tasks)}")

    # Categorize tasks by their (m,r,w,c) values
    tasks_by_params = {}
    for task_name, task_details in valid_tasks.items():
        m, r, w, c = extract_difficulty(task_name)
        key = (m, r, w, c)
        if key not in tasks_by_params:
            tasks_by_params[key] = []
        tasks_by_params[key].append((task_name, task_details))

    # # Print available combinations
    # print("Available (m,r,w,c) combinations:")
    # for params, tasks in tasks_by_params.items():
    #     print(f"  {params}: {len(tasks)} tasks")

    # Sample tasks according to the distribution
    sampled_tasks = {}
    already_sampled = set()

    # 1. Sample 3 tasks for each (m,r) where 0 <= m <= 2 and 0 <= r <= 2
    for m in range(3):
        for r in range(3):
            # Find all tasks with the current (m,r) and w,c between 0 and 1
            candidates = []
            for params, tasks in tasks_by_params.items():
                if params[0] == m and params[1] == r and params[2] <= 1 and params[3] <= 1:
                    candidates.extend(tasks)

            # Sample 3 tasks if possible
            if len(candidates) >= 3:
                sampled = random.sample(candidates, 3)
                for task_name, task_details in sampled:
                    if task_name not in already_sampled:
                        sampled_tasks[task_name] = task_details
                        already_sampled.add(task_name)
            else:
                print(f"Warning: Not enough tasks for (m={m}, r={r}) with w,c <= 1. Found {len(candidates)}.")
                # Add all available
                for task_name, task_details in candidates:
                    if task_name not in already_sampled:
                        sampled_tasks[task_name] = task_details
                        already_sampled.add(task_name)

    # 2. Add 2 tasks with (m,r,w,c) = (0,0,0,0)
    zero_zero_zero_zero = tasks_by_params.get((0,0,0,0), [])
    zero_zero_zero_zero = [t for t in zero_zero_zero_zero if t[0] not in already_sampled]

    if len(zero_zero_zero_zero) >= 2:
        additional = random.sample(zero_zero_zero_zero, 2)
        for task_name, task_details in additional:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)
    else:
        print(f"Warning: Not enough tasks for (0,0,0,0). Found {len(zero_zero_zero_zero)}.")
        for task_name, task_details in zero_zero_zero_zero:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)

    # 3. Add 1 task with (m,r,w,c) = (1,0,0,0)
    one_zero_zero_zero = tasks_by_params.get((1,0,0,0), [])
    one_zero_zero_zero = [t for t in one_zero_zero_zero if t[0] not in already_sampled]

    if len(one_zero_zero_zero) >= 1:
        additional = random.sample(one_zero_zero_zero, 1)
        for task_name, task_details in additional:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)
    else:
        print(f"Warning: Not enough tasks for (1,0,0,0). Found {len(one_zero_zero_zero)}.")
        for task_name, task_details in one_zero_zero_zero:
            sampled_tasks[task_name] = task_details
            already_sampled.add(task_name)

    # Print summary of sampled tasks
    print(f"\nTotal sampled tasks: {len(sampled_tasks)}")

    # Count tasks by their (m,r) values
    distribution = {}
    for task_name in sampled_tasks:
        m, r, w, c = extract_difficulty(task_name)
        key = (m, r)
        if key not in distribution:
            distribution[key] = []
        distribution[key].append((w, c))

    print("\nDistribution of sampled tasks:")
    for mr, wc_list in distribution.items():
        print(f"  (m={mr[0]}, r={mr[1]}): {len(wc_list)} tasks")
        for wc in wc_list:
            print(f"    (w={wc[0]}, c={wc[1]})")

    # Check for duplicates in sampled tasks
    if len(sampled_tasks) != len(set(sampled_tasks.keys())):
        print("\nWARNING: Duplicate tasks detected!")

        # Find the duplicates
        task_counts = {}
        for task_name in sampled_tasks.keys():
            task_counts[task_name] = task_counts.get(task_name, 0) + 1

        duplicates = [task for task, count in task_counts.items() if count > 1]
        print(f"Duplicate tasks: {duplicates}")
    else:
        print("\nVerification: No duplicates found in the sampled tasks.")

    # Save to output file
    with open(output_path, 'w') as f:
        json.dump(sampled_tasks, f, indent=4)

    print(f"\nSaved {len(sampled_tasks)} distributed tasks to {output_path}")


# Example usage:
# process_json('test/2agents.json', 'test/2_agents_easiest_tasks.json', alpha=1.0, beta=3.0)

# Iterate through files in the tasks folder
tasks_dir = 'test'
for filename in os.listdir(tasks_dir):
    if filename.endswith('agents.json'):
        input_path = os.path.join(tasks_dir, filename)
        # Create output filename by replacing .json with _distributed_tasks.json
        output_filename = filename.replace('.json', '_distributed_tasks.json')
        output_path = os.path.join(tasks_dir, output_filename)
        print(f"\nProcessing {filename}...")
        sample_tasks_with_distribution(input_path, output_path)
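The sampling distribution in the docstring of sample_tasks_with_distribution accounts for exactly the 30 easy test tasks named in the commit message: 9 (m, r) cells × 3 tasks each, plus 2 extra (0,0,0,0) tasks and 1 extra (1,0,0,0) task. To make the name parsing and scoring concrete, here is a minimal standalone sketch; the task name is hypothetical, but the regex and weights mirror extract_difficulty and calculate_difficulty_score:

# Standalone sketch of the parsing and scoring used above.
# The task name here is hypothetical; real names are the keys of the task JSON files.
import re

name = "church_materials_2_rooms_1_window_0_carpet_1_variant_3"
m, r, w, c = map(int, re.search(
    r'materials_(\d+)_rooms_(\d+)_window_(\d+)_carpet_(\d+)_variant_\d+',
    name).groups())
score = m*4 + r*10 + w*2 + c*1  # 2*4 + 1*10 + 0*2 + 1*1 = 19
print((m, r, w, c), score)      # (2, 1, 0, 1) 19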
tasks/construction_tasks/filter_training_tasks.py (new file, 87 lines)
@@ -0,0 +1,87 @@
import json
import re
import random
import os
from collections import defaultdict


def extract_difficulty(task_name):
    """Extract difficulty parameters from the task name."""
    match = re.search(r'materials_(\d+)_rooms_(\d+)_window_(\d+)_carpet_(\d+)_variant_\d+', task_name)
    if match:
        return tuple(map(int, match.groups()))  # (m, r, w, c)
    return (0, 0, 0, 0)  # Default if not found


def filter_and_sample_tasks(file_path, output_path):
    """Filters, samples, and saves 500 unique tasks based on given criteria."""
    with open(file_path, 'r') as f:
        data = json.load(f)

    total_tasks = len(data)
    print(f"\nProcessing file: {file_path}")
    print(f"Total available tasks: {total_tasks}")

    valid_tasks = {}

    # Filter tasks with at least 3 levels
    for task_name, task_details in data.items():
        num_levels = len(task_details.get("blueprint", {}).get("levels", []))
        if num_levels >= 3:
            valid_tasks[task_name] = task_details

    print(f"Tasks with at least 3 levels: {len(valid_tasks)}")

    # Organize tasks by difficulty parameters (m, r, w, c)
    tasks_by_params = defaultdict(list)
    for task_name, task_details in valid_tasks.items():
        key = extract_difficulty(task_name)
        tasks_by_params[key].append((task_name, task_details))

    # Sort keys in increasing order
    sorted_keys = sorted(tasks_by_params.keys())
    sampled_tasks = {}
    total_selected = 0
    sampled_task_counts = defaultdict(int)

    # Pick tasks sequentially until 500 are collected
    for key in sorted_keys:
        if total_selected >= 500:
            break

        if key in tasks_by_params:
            candidates = tasks_by_params[key]
            for task_name, task_details in candidates:
                if total_selected < 500:
                    sampled_tasks[task_name] = task_details
                    sampled_task_counts[key] += 1  # Keep the key as a tuple
                    total_selected += 1
                else:
                    break

    print(f"\nTotal sampled tasks: {len(sampled_tasks)}")

    # Print task count per (m, r, w, c) tuple
    print("\nTask count per (m, r, w, c):")
    for key, count in sorted(sampled_task_counts.items()):
        print(f"{key}: {count}")

    # Randomly shuffle the tasks before saving
    shuffled_tasks = list(sampled_tasks.items())
    random.shuffle(shuffled_tasks)
    final_tasks = dict(shuffled_tasks)

    # Save sampled tasks to JSON
    with open(output_path, 'w') as f:
        json.dump(final_tasks, f, indent=4)

    print(f"\nSaved {len(final_tasks)} tasks to {output_path}")


# Process all relevant files
tasks_dir = 'train'
all_filenames = [f for f in os.listdir(tasks_dir) if f.endswith('agents.json')]
all_filenames.sort()

for i, filename in enumerate(all_filenames):
    input_path = os.path.join(tasks_dir, filename)
    output_filename = filename.replace('.json', '_sampled_tasks_for_training.json')
    output_path = os.path.join(tasks_dir, output_filename)
    filter_and_sample_tasks(input_path, output_path)