mirror of
https://github.com/kolbytn/mindcraft.git
synced 2025-07-28 10:55:27 +02:00

- Added new evaluation.py with dynamic agent configuration support - Implemented comprehensive test suite (38 tests, 100% pass rate) - Enhanced evaluation_script.py with improved error handling and logging - Updated analysis tools for better outcome reporting and visualization - Added extensive documentation including architecture guide and user manuals - Maintained backward compatibility with existing task formats - Improved performance and reliability for multi-agent evaluations Key improvements: - Flexible agent count configuration (1-N agents) - Rich outcome data structures with detailed metrics - Comprehensive error handling and recovery mechanisms - Enhanced logging and debugging capabilities - Complete test coverage for production readiness Files added/modified: - tasks/evaluation.py (new core evaluation engine) - tasks/test_*.py (comprehensive test suite) - docs/ (complete documentation suite) - Updated analysis and visualization tools
361 lines
No EOL
15 KiB
Python
361 lines
No EOL
15 KiB
Python
import unittest
|
|
import os
|
|
import json
|
|
import tempfile
|
|
import shutil
|
|
import pandas as pd
|
|
from unittest.mock import patch
|
|
|
|
from tasks.evaluation import (
|
|
CompletionStatus,
|
|
extract_task_outcome,
|
|
aggregate_results_to_dataframe,
|
|
)
|
|
from tasks.evaluation_script import aggregate_results
|
|
|
|
|
|
class TestRegressionCompatibility(unittest.TestCase):
|
|
"""
|
|
Regression tests to ensure the new evaluation system maintains backward
|
|
compatibility with legacy data formats and logic.
|
|
"""
|
|
|
|
def setUp(self):
|
|
"""Set up a temporary directory for test data."""
|
|
self.test_dir = tempfile.mkdtemp()
|
|
self.exp_dir = os.path.join(self.test_dir, "experiments")
|
|
os.makedirs(self.exp_dir, exist_ok=True)
|
|
|
|
def tearDown(self):
|
|
"""Clean up the temporary directory."""
|
|
shutil.rmtree(self.test_dir)
|
|
|
|
def create_legacy_compatible_data(self):
|
|
"""
|
|
Creates a mock experiment directory with log files that mimic the
|
|
output patterns and scoring of the legacy system.
|
|
"""
|
|
# Task definitions matching legacy format
|
|
task_definitions = {
|
|
"multiagent_cooking_1_cooked_chicken_1_golden_carrot": {
|
|
"task_id": "multiagent_cooking_1_cooked_chicken_1_golden_carrot",
|
|
"type": "cooking",
|
|
"agent_count": 2,
|
|
"task_type": "cooking",
|
|
"difficulty_metrics": {
|
|
"total_recipe_steps": 4,
|
|
"unique_target_items": 2
|
|
}
|
|
},
|
|
"multiagent_crafting_1_wooden_sword": {
|
|
"task_id": "multiagent_crafting_1_wooden_sword",
|
|
"type": "crafting",
|
|
"agent_count": 2,
|
|
"task_type": "crafting",
|
|
"difficulty_metrics": {
|
|
"total_steps": 3,
|
|
"required_tools": 1
|
|
}
|
|
},
|
|
"construction_small_house": {
|
|
"task_id": "construction_small_house",
|
|
"type": "construction",
|
|
"agent_count": 1,
|
|
"task_type": "construction",
|
|
"difficulty_metrics": {
|
|
"blueprint_size": 25,
|
|
"required_blocks": 15
|
|
}
|
|
}
|
|
}
|
|
|
|
# Create folder structure: model/task_id/
|
|
model_dir = os.path.join(self.exp_dir, "claude-3-5-sonnet-latest")
|
|
os.makedirs(model_dir, exist_ok=True)
|
|
|
|
task_folders = []
|
|
|
|
# Successful cooking task (legacy: both agents succeed)
|
|
cooking_dir = os.path.join(model_dir, "multiagent_cooking_1_cooked_chicken_1_golden_carrot")
|
|
os.makedirs(cooking_dir, exist_ok=True)
|
|
task_folders.append(cooking_dir)
|
|
|
|
for i in range(2):
|
|
agent_log = [
|
|
{"role": "user", "content": "Starting cooking task"},
|
|
{"role": "assistant", "content": "I will cook the required items"},
|
|
{"role": "system", "content": "Task ended with score : 1"}
|
|
]
|
|
with open(os.path.join(cooking_dir, f"agent_{i}.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
# Failed crafting task (legacy: one agent fails, one succeeds - overall should be success)
|
|
crafting_dir = os.path.join(model_dir, "multiagent_crafting_1_wooden_sword")
|
|
os.makedirs(crafting_dir, exist_ok=True)
|
|
task_folders.append(crafting_dir)
|
|
|
|
# Agent 0: Success
|
|
agent_log = [
|
|
{"role": "system", "content": "Task ended with score : 1"}
|
|
]
|
|
with open(os.path.join(crafting_dir, "agent_0.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
# Agent 1: Failure
|
|
agent_log = [
|
|
{"role": "system", "content": "Task ended with score : 0"}
|
|
]
|
|
with open(os.path.join(crafting_dir, "agent_1.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
# Construction task with partial score (legacy: should be partial success)
|
|
construction_dir = os.path.join(model_dir, "construction_small_house")
|
|
os.makedirs(construction_dir, exist_ok=True)
|
|
task_folders.append(construction_dir)
|
|
|
|
agent_log = [
|
|
{"role": "system", "content": "Task ended with score : 0.6"}
|
|
]
|
|
with open(os.path.join(construction_dir, "agent_0.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
return task_folders, task_definitions
|
|
|
|
def test_success_rate_calculation_compatibility(self):
|
|
"""
|
|
Tests that the success rate calculation aligns with legacy expectations,
|
|
where any agent scoring 1.0 marks the task as successful.
|
|
"""
|
|
task_folders, task_definitions = self.create_legacy_compatible_data()
|
|
|
|
# Run new system
|
|
results_df = aggregate_results(task_folders, task_definitions)
|
|
|
|
# Legacy expectations:
|
|
# - Cooking: SUCCESS (both agents scored 1.0)
|
|
# - Crafting: SUCCESS (any agent scored 1.0)
|
|
# - Construction: FAILED (score < 1.0, but > 0)
|
|
|
|
cooking_result = results_df[results_df['task_id'].str.contains('cooking')].iloc[0]
|
|
self.assertTrue(cooking_result['overall_is_successful'])
|
|
self.assertEqual(cooking_result['overall_raw_score'], 1.0)
|
|
|
|
crafting_result = results_df[results_df['task_id'].str.contains('crafting')].iloc[0]
|
|
self.assertTrue(crafting_result['overall_is_successful']) # Any agent success = overall success
|
|
self.assertEqual(crafting_result['overall_raw_score'], 1.0)
|
|
|
|
construction_result = results_df[results_df['task_id'].str.contains('construction')].iloc[0]
|
|
self.assertFalse(construction_result['overall_is_successful']) # < 1.0 = not successful
|
|
self.assertEqual(construction_result['overall_raw_score'], 0.6)
|
|
|
|
def test_agent_count_flexibility(self):
|
|
"""
|
|
Tests that the system correctly handles tasks with a variable number of
|
|
agents, a scenario the legacy system may have handled rigidly.
|
|
"""
|
|
task_definitions = {
|
|
"single_agent_task": {
|
|
"task_id": "single_agent_task",
|
|
"type": "crafting",
|
|
"agent_count": 1,
|
|
"task_type": "crafting"
|
|
},
|
|
"triple_agent_task": {
|
|
"task_id": "triple_agent_task",
|
|
"type": "cooking",
|
|
"agent_count": 3,
|
|
"task_type": "cooking"
|
|
},
|
|
"five_agent_task": {
|
|
"task_id": "five_agent_task",
|
|
"type": "construction",
|
|
"agent_count": 5,
|
|
"task_type": "construction"
|
|
}
|
|
}
|
|
|
|
model_dir = os.path.join(self.exp_dir, "test_model")
|
|
os.makedirs(model_dir, exist_ok=True)
|
|
|
|
task_folders = []
|
|
|
|
# Single agent task
|
|
single_dir = os.path.join(model_dir, "single_agent_task")
|
|
os.makedirs(single_dir, exist_ok=True)
|
|
task_folders.append(single_dir)
|
|
|
|
agent_log = [{"role": "system", "content": "Task ended with score : 1"}]
|
|
with open(os.path.join(single_dir, "agent_0.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
# Triple agent task
|
|
triple_dir = os.path.join(model_dir, "triple_agent_task")
|
|
os.makedirs(triple_dir, exist_ok=True)
|
|
task_folders.append(triple_dir)
|
|
|
|
for i in range(3):
|
|
agent_log = [{"role": "system", "content": f"Task ended with score : {0.5 if i == 0 else 1}"}]
|
|
with open(os.path.join(triple_dir, f"agent_{i}.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
# Five agent task
|
|
five_dir = os.path.join(model_dir, "five_agent_task")
|
|
os.makedirs(five_dir, exist_ok=True)
|
|
task_folders.append(five_dir)
|
|
|
|
for i in range(5):
|
|
agent_log = [{"role": "system", "content": f"Task ended with score : {0 if i < 2 else 0.8}"}]
|
|
with open(os.path.join(five_dir, f"agent_{i}.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
# Test that new system handles all agent counts without errors
|
|
results_df = aggregate_results(task_folders, task_definitions)
|
|
|
|
self.assertEqual(len(results_df), 3)
|
|
|
|
# Verify agent counts are correct
|
|
single_result = results_df[results_df['task_id'] == 'single_agent_task'].iloc[0]
|
|
self.assertEqual(single_result['total_agent_logs_found'], 1)
|
|
self.assertTrue(single_result['overall_is_successful'])
|
|
|
|
triple_result = results_df[results_df['task_id'] == 'triple_agent_task'].iloc[0]
|
|
self.assertEqual(triple_result['total_agent_logs_found'], 3)
|
|
self.assertTrue(triple_result['overall_is_successful']) # Any agent succeeded
|
|
|
|
five_result = results_df[results_df['task_id'] == 'five_agent_task'].iloc[0]
|
|
self.assertEqual(five_result['total_agent_logs_found'], 5)
|
|
self.assertFalse(five_result['overall_is_successful']) # Max score 0.8 < 1.0
|
|
|
|
def test_timeout_handling_consistency(self):
|
|
"""
|
|
Tests that timeout messages are handled consistently and that a timeout
|
|
in any agent log correctly marks the entire task as timed out.
|
|
"""
|
|
task_definitions = {
|
|
"timeout_task": {
|
|
"task_id": "timeout_task",
|
|
"type": "cooking",
|
|
"agent_count": 2,
|
|
"task_type": "cooking"
|
|
},
|
|
"mixed_timeout_task": {
|
|
"task_id": "mixed_timeout_task",
|
|
"type": "crafting",
|
|
"agent_count": 2,
|
|
"task_type": "crafting"
|
|
}
|
|
}
|
|
|
|
model_dir = os.path.join(self.exp_dir, "timeout_model")
|
|
os.makedirs(model_dir, exist_ok=True)
|
|
|
|
# Pure timeout task
|
|
timeout_dir = os.path.join(model_dir, "timeout_task")
|
|
os.makedirs(timeout_dir, exist_ok=True)
|
|
|
|
for i in range(2):
|
|
agent_log = [
|
|
{"role": "user", "content": "Starting task"},
|
|
{"role": "system", "content": "Task timeout reached"}
|
|
]
|
|
with open(os.path.join(timeout_dir, f"agent_{i}.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
# Mixed: one timeout, one success
|
|
mixed_dir = os.path.join(model_dir, "mixed_timeout_task")
|
|
os.makedirs(mixed_dir, exist_ok=True)
|
|
|
|
# Agent 0: timeout
|
|
agent_log = [{"role": "system", "content": "Task timeout reached"}]
|
|
with open(os.path.join(mixed_dir, "agent_0.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
# Agent 1: success
|
|
agent_log = [{"role": "system", "content": "Task ended with score : 1"}]
|
|
with open(os.path.join(mixed_dir, "agent_1.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
task_folders = [timeout_dir, mixed_dir]
|
|
results_df = aggregate_results(task_folders, task_definitions)
|
|
|
|
# Pure timeout should be TIMED_OUT
|
|
timeout_result = results_df[results_df['task_id'] == 'timeout_task'].iloc[0]
|
|
self.assertEqual(timeout_result['overall_completion_status'], CompletionStatus.TIMED_OUT)
|
|
self.assertFalse(timeout_result['overall_is_successful'])
|
|
|
|
# Mixed should prioritize timeout over success (as per architecture)
|
|
mixed_result = results_df[results_df['task_id'] == 'mixed_timeout_task'].iloc[0]
|
|
self.assertEqual(mixed_result['overall_completion_status'], CompletionStatus.TIMED_OUT)
|
|
self.assertFalse(mixed_result['overall_is_successful'])
|
|
|
|
def test_dataframe_output_format_compatibility(self):
|
|
"""
|
|
Tests that the output DataFrame contains all the essential columns with
|
|
the correct data types, ensuring compatibility with downstream analysis tools.
|
|
"""
|
|
task_folders, task_definitions = self.create_legacy_compatible_data()
|
|
results_df = aggregate_results(task_folders, task_definitions)
|
|
|
|
# Essential columns that downstream tools expect
|
|
expected_columns = [
|
|
'task_id',
|
|
'model_name',
|
|
'agent_count',
|
|
'task_type',
|
|
'overall_raw_score',
|
|
'overall_is_successful',
|
|
'overall_completion_status',
|
|
'total_agent_logs_found'
|
|
]
|
|
|
|
for col in expected_columns:
|
|
self.assertIn(col, results_df.columns, f"Missing expected column: {col}")
|
|
|
|
# Check data types are appropriate
|
|
self.assertTrue(results_df['overall_raw_score'].dtype in ['float64', 'float32'])
|
|
self.assertTrue(results_df['overall_is_successful'].dtype == 'bool')
|
|
self.assertTrue(results_df['agent_count'].dtype in ['int64', 'int32'])
|
|
|
|
# Check for any NaN values in critical columns
|
|
critical_columns = ['task_id', 'overall_raw_score', 'overall_is_successful']
|
|
for col in critical_columns:
|
|
self.assertFalse(results_df[col].isna().any(), f"Found NaN values in {col}")
|
|
|
|
def test_score_aggregation_logic_consistency(self):
|
|
"""
|
|
Tests that the overall task score is correctly aggregated as the maximum
|
|
score achieved by any single agent in the task.
|
|
"""
|
|
task_definitions = {
|
|
"max_score_test": {
|
|
"task_id": "max_score_test",
|
|
"type": "cooking",
|
|
"agent_count": 3,
|
|
"task_type": "cooking"
|
|
}
|
|
}
|
|
|
|
model_dir = os.path.join(self.exp_dir, "score_test")
|
|
os.makedirs(model_dir, exist_ok=True)
|
|
|
|
# Test that max score is taken across agents
|
|
test_dir = os.path.join(model_dir, "max_score_test")
|
|
os.makedirs(test_dir, exist_ok=True)
|
|
|
|
scores = [0.3, 0.8, 0.5]
|
|
for i, score in enumerate(scores):
|
|
agent_log = [{"role": "system", "content": f"Task ended with score : {score}"}]
|
|
with open(os.path.join(test_dir, f"agent_{i}.json"), "w") as f:
|
|
json.dump(agent_log, f)
|
|
|
|
results_df = aggregate_results([test_dir], task_definitions)
|
|
result = results_df.iloc[0]
|
|
|
|
# Should take maximum score (0.8)
|
|
self.assertEqual(result['overall_raw_score'], 0.8)
|
|
self.assertFalse(result['overall_is_successful']) # < 1.0
|
|
self.assertEqual(result['overall_completion_status'], CompletionStatus.FAILED_PARTIAL_SCORE)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
unittest.main() |