import unittest import os import json import pandas as pd from unittest.mock import patch, mock_open from tasks.evaluation import ( CompletionStatus, AgentOutcome, TaskRunOutcome, analyze_agent_log, extract_task_outcome, aggregate_results_to_dataframe, ) class TestEvaluation(unittest.TestCase): """Unit tests for the core evaluation logic in evaluation.py.""" def setUp(self): """Set up a temporary directory for log files.""" self.test_dir = "test_logs" os.makedirs(self.test_dir, exist_ok=True) def tearDown(self): """Clean up the temporary directory and its contents.""" for f in os.listdir(self.test_dir): os.remove(os.path.join(self.test_dir, f)) os.rmdir(self.test_dir) def test_analyze_agent_log_success(self): """ Tests analysis of a log file where the agent successfully completes the task. """ log_content = [ {"role": "user", "content": "Start task"}, {"role": "system", "content": "Task ended with score : 1.0"} ] log_path = os.path.join(self.test_dir, "success.json") with open(log_path, "w") as f: json.dump(log_content, f) outcome = analyze_agent_log(log_path) self.assertEqual(outcome.raw_score, 1.0) self.assertEqual(outcome.completion_status, CompletionStatus.SUCCESS) self.assertTrue(outcome.agent_log_processed) def test_analyze_agent_log_timeout(self): """ Tests analysis of a log file where the agent's task times out. """ log_content = [ {"role": "user", "content": "Start task"}, {"role": "system", "content": "Task timeout reached"} ] log_path = os.path.join(self.test_dir, "timeout.json") with open(log_path, "w") as f: json.dump(log_content, f) outcome = analyze_agent_log(log_path) self.assertEqual(outcome.raw_score, 0.0) self.assertEqual(outcome.completion_status, CompletionStatus.TIMED_OUT) self.assertTrue(outcome.timed_out) def test_analyze_agent_log_file_not_found(self): """ Tests that the system handles a non-existent log file gracefully. """ outcome = analyze_agent_log("non_existent_file.json") self.assertEqual(outcome.completion_status, CompletionStatus.LOG_FILE_ERROR) self.assertFalse(outcome.agent_log_processed) def test_analyze_agent_log_json_error(self): """ Tests that the system handles a log file with invalid JSON content. """ log_path = os.path.join(self.test_dir, "error.json") with open(log_path, "w") as f: f.write("invalid json") outcome = analyze_agent_log(log_path) self.assertEqual(outcome.completion_status, CompletionStatus.LOG_FILE_ERROR) self.assertIn("JSONDecodeError", outcome.parsing_errors[0]) def test_extract_task_outcome_multiple_agents(self): """ Tests the aggregation of outcomes from multiple agents for a single task. Ensures that the highest score determines the overall outcome. """ # Agent 1: Success log_content_1 = [{"role": "system", "content": "Task ended with score : 1.0"}] log_path_1 = os.path.join(self.test_dir, "agent1.json") with open(log_path_1, "w") as f: json.dump(log_content_1, f) # Agent 2: Partial Score log_content_2 = [{"role": "system", "content": "Task ended with score : 0.5"}] log_path_2 = os.path.join(self.test_dir, "agent2.json") with open(log_path_2, "w") as f: json.dump(log_content_2, f) task_def = {"task_id": "test_task_1", "agent_count": 2, "task_type": "test", "difficulty_metrics": {"complexity": 5}} outcome = extract_task_outcome(self.test_dir, task_def) self.assertEqual(outcome.overall_raw_score, 1.0) self.assertTrue(outcome.overall_is_successful) self.assertEqual(outcome.overall_completion_status, CompletionStatus.SUCCESS) self.assertEqual(outcome.total_agent_logs_found, 2) def test_aggregate_results_to_dataframe(self): """ Tests the conversion of multiple TaskRunOutcome objects into a Pandas DataFrame. Verifies that the DataFrame is structured correctly and metrics are flattened. """ task_outcomes = [ TaskRunOutcome( task_id="task1", model_name="gpt-4", agent_count=1, task_type="crafting", overall_raw_score=1.0, overall_is_successful=True, overall_completion_status=CompletionStatus.SUCCESS, total_agent_logs_found=1, agent_outcomes=[], task_definition_metrics={"steps": 10, "tools": 2} ), TaskRunOutcome( task_id="task2", model_name="gpt-4", agent_count=2, task_type="cooking", overall_raw_score=0.0, overall_is_successful=False, overall_completion_status=CompletionStatus.TIMED_OUT, total_agent_logs_found=2, agent_outcomes=[], task_definition_metrics={"steps": 20, "tools": 5} ) ] df = aggregate_results_to_dataframe(task_outcomes) self.assertIsInstance(df, pd.DataFrame) self.assertEqual(len(df), 2) self.assertIn("metric_steps", df.columns) self.assertIn("metric_tools", df.columns) self.assertEqual(df.loc[0, "metric_steps"], 10) if __name__ == '__main__': unittest.main()