import os from dataclasses import dataclass, field from enum import Enum from typing import List, Dict, Any import pandas as pd import logging # Set up basic logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') class CompletionStatus(Enum): """Enumeration for the completion status of a task.""" SUCCESS = "SUCCESS" FAILED_SCORE_ZERO = "FAILED_SCORE_ZERO" FAILED_PARTIAL_SCORE = "FAILED_PARTIAL_SCORE" TIMED_OUT = "TIMED_OUT" NO_SCORE_LOGGED = "NO_SCORE_LOGGED" LOG_FILE_ERROR = "LOG_FILE_ERROR" @dataclass class AgentOutcome: """ Holds the outcome of a single agent's task, including score and status. Attributes: raw_score (float): The score extracted from the log file. completion_status (CompletionStatus): The final status of the agent's task. final_system_message (str): The last system message, often containing the score. agent_log_processed (bool): True if the log was successfully processed. parsing_errors (List[str]): A list of errors encountered during parsing. timed_out (bool): True if the agent timed out. """ raw_score: float completion_status: CompletionStatus final_system_message: str agent_log_processed: bool parsing_errors: List[str] = field(default_factory=list) timed_out: bool = False @dataclass class TaskRunOutcome: """ Holds the aggregated outcome of a single task run, including all agents. Attributes: task_id (str): The unique identifier for the task. model_name (str): The name of the model used for the task. agent_count (int): The number of agents participating in the task. task_type (str): The category of the task (e.g., 'cooking', 'crafting'). overall_raw_score (float): The highest score achieved by any agent. overall_is_successful (bool): True if the task was completed successfully. overall_completion_status (CompletionStatus): The final aggregated status of the task. total_agent_logs_found (int): The number of agent log files found. agent_outcomes (List[AgentOutcome]): A list of individual agent outcomes. task_definition_metrics (Dict[str, Any]): Metrics from the task definition file. """ task_id: str model_name: str agent_count: int task_type: str overall_raw_score: float overall_is_successful: bool overall_completion_status: CompletionStatus total_agent_logs_found: int agent_outcomes: List[AgentOutcome] task_definition_metrics: Dict[str, Any] import json import re def analyze_agent_log(file_path: str) -> AgentOutcome: """ Analyzes a single agent's JSON log file to extract key outcomes. This function reads a JSON log file, parses its content to find the final score, timeout status, and other relevant information. It is designed to be robust against file I/O errors and malformed JSON. Args: file_path (str): The full path to the agent's log file. Returns: AgentOutcome: A dataclass containing the analysis results for one agent. """ try: with open(file_path, 'r') as f: log_data = json.load(f) except FileNotFoundError: logging.warning(f"Log file not found: {file_path}") return AgentOutcome( raw_score=0.0, completion_status=CompletionStatus.LOG_FILE_ERROR, final_system_message="", agent_log_processed=False, parsing_errors=["FileNotFoundError"], ) except json.JSONDecodeError as e: logging.error(f"JSON decoding error in {file_path}: {e}") return AgentOutcome( raw_score=0.0, completion_status=CompletionStatus.LOG_FILE_ERROR, final_system_message="", agent_log_processed=False, parsing_errors=[f"JSONDecodeError: {e}"], ) timed_out = False final_system_message = "" raw_score = 0.0 completion_status = CompletionStatus.NO_SCORE_LOGGED for entry in reversed(log_data): if entry.get("role") == "system": content = entry.get("content", "") if "Task timeout reached" in content: timed_out = True final_system_message = content completion_status = CompletionStatus.TIMED_OUT break score_match = re.search(r"Task ended with score : (\d+\.?\d*)", content) if score_match: raw_score = float(score_match.group(1)) final_system_message = content if raw_score == 1.0: completion_status = CompletionStatus.SUCCESS elif raw_score == 0.0: completion_status = CompletionStatus.FAILED_SCORE_ZERO else: completion_status = CompletionStatus.FAILED_PARTIAL_SCORE break return AgentOutcome( raw_score=raw_score, completion_status=completion_status, final_system_message=final_system_message, agent_log_processed=True, timed_out=timed_out, ) import glob def extract_task_outcome(folder_path: str, task_definition: Dict[str, Any]) -> TaskRunOutcome: """ Orchestrates the analysis of a single task run folder by aggregating agent logs. This function scans a given folder for agent log files (*.json), analyzes each one, and then aggregates the results into a single `TaskRunOutcome`. It determines the overall success and status based on the collective performance of all agents. Args: folder_path (str): The path to the folder containing agent logs for a single run. task_definition (Dict[str, Any]): The task definition dictionary, used for metadata. Returns: TaskRunOutcome: A dataclass containing the aggregated results for the task run. """ agent_log_files = glob.glob(os.path.join(folder_path, "*.json")) agent_outcomes = [analyze_agent_log(log_file) for log_file in agent_log_files] if not agent_outcomes: logging.warning(f"No agent logs found in {folder_path} for task {task_definition.get('task_id', '')}") return TaskRunOutcome( task_id=task_definition.get("task_id", ""), model_name="", # Will be populated later agent_count=task_definition.get("agent_count", 0), task_type=task_definition.get("task_type", ""), overall_raw_score=0.0, overall_is_successful=False, overall_completion_status=CompletionStatus.NO_SCORE_LOGGED, total_agent_logs_found=0, agent_outcomes=[], task_definition_metrics=task_definition.get("difficulty_metrics", {}), ) overall_raw_score = max(outcome.raw_score for outcome in agent_outcomes) # If any agent timed out, the whole task is considered timed out. if any(outcome.timed_out for outcome in agent_outcomes): overall_completion_status = CompletionStatus.TIMED_OUT # If any agent succeeded, the task is a success. elif any(outcome.completion_status == CompletionStatus.SUCCESS for outcome in agent_outcomes): overall_completion_status = CompletionStatus.SUCCESS # If all agents have partial scores, the task is partially successful elif all(outcome.completion_status == CompletionStatus.FAILED_PARTIAL_SCORE for outcome in agent_outcomes): overall_completion_status = CompletionStatus.FAILED_PARTIAL_SCORE else: # Fallback to the status of the first agent if no clear success/timeout overall_completion_status = agent_outcomes[0].completion_status overall_is_successful = overall_completion_status == CompletionStatus.SUCCESS return TaskRunOutcome( task_id=task_definition.get("task_id", ""), model_name="", # Will be populated later agent_count=task_definition.get("agent_count", 0), task_type=task_definition.get("task_type", ""), overall_raw_score=overall_raw_score, overall_is_successful=overall_is_successful, overall_completion_status=overall_completion_status, total_agent_logs_found=len(agent_outcomes), agent_outcomes=agent_outcomes, task_definition_metrics=task_definition.get("difficulty_metrics", {}), ) def aggregate_results_to_dataframe(task_outcomes: List[TaskRunOutcome]) -> pd.DataFrame: """ Converts a list of TaskRunOutcome objects into a Pandas DataFrame. This function is a key step in the analysis pipeline, transforming the raw outcome objects into a structured DataFrame suitable for advanced analysis, visualization, and reporting. It flattens nested metric dictionaries for easier access. Args: task_outcomes (List[TaskRunOutcome]): A list of task outcome objects to be aggregated. Returns: pd.DataFrame: A DataFrame where each row represents a single task run. """ if not task_outcomes: return pd.DataFrame() # Convert list of dataclasses to list of dicts outcome_dicts = [vars(outcome) for outcome in task_outcomes] # Create DataFrame df = pd.DataFrame(outcome_dicts) # Flatten the 'task_definition_metrics' dictionary into separate columns if 'task_definition_metrics' in df.columns: metrics_df = df['task_definition_metrics'].apply(pd.Series) metrics_df = metrics_df.add_prefix('metric_') df = pd.concat([df.drop(['task_definition_metrics'], axis=1), metrics_df], axis=1) # The 'agent_outcomes' is a complex object (list of dataclasses). # For now, we'll leave it as is, but it can be flattened further if needed. return df