Thoroughly fixed evaluation_script success calculations and added support for debugging the same

Ayush Maniar 2025-04-17 13:06:19 -07:00
parent c583e2d5e1
commit e8e8212832


@@ -57,7 +57,6 @@ def analyze_json_file(file_path):
         for turn in data["turns"]:
             if turn.get("role") == "system" and "content" in turn:
                 if isinstance(turn["content"], str) and "Task ended with score : " in turn["content"]:
-                    score_found = True
                     if "Task ended with score : 1" in turn["content"]:
                         return 1
                     elif "Task ended with score : 0" in turn["content"]:
@@ -66,7 +65,8 @@ def analyze_json_file(file_path):
                         score = float(turn["content"].split(":")[-1].strip())
                         return score
-        return False
+        return None
     except FileNotFoundError:
         print(f"Error: File not found: {file_path}")
         return None
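For reference, the retained parsing line extracts the numeric score by splitting the system message on its last colon; a minimal sketch of that line in isolation, with an invented example message:

    content = "Task ended with score : 0.75"      # invented example message
    score = float(content.split(":")[-1].strip())  # text after the last colon -> 0.75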
@@ -86,11 +86,14 @@ def extract_result(folder_path):
         return None
     else:
         score = None
+        curr_score = 0
         for json_file in json_files:
             score = analyze_json_file(json_file)
             if score is not None:
-                return score
-        return 0
+                max_score = max(score, curr_score)
+                curr_score = max_score
+        return curr_score
 
 def aggregate_results(local_folders):
     """
@@ -106,22 +109,77 @@ def aggregate_results(local_folders):
     total = 0
     successful = 0
+    successful_tasks = []
     for folder_path in tqdm(local_folders):
         folder_name = os.path.basename(folder_path)
         try:
             result = extract_result(folder_path)
+            if result == 1:
+                successful_tasks.append(folder_name)
             if result is not None:
                 total += 1
                 successful += result
         except Exception as e:
             print(f"Error processing {folder_name}: {e}")
+    successful_tasks.sort()
+    for i in successful_tasks:
+        print(f"Successful task: {i}")
     return {
         "total": total,
         "successful": successful,
     }
 
+def check_folder_results(folder_path):
+    """
+    Evaluate all JSON files in a folder and its subfolders and calculate success metrics.
+
+    Args:
+        folder_path (str): Path to the folder containing JSON log files.
+
+    Returns:
+        dict: A dictionary with success metrics.
+    """
+    print(f"Checking results in folder: {folder_path}")
+
+    # Check if the folder exists
+    if not os.path.exists(folder_path):
+        print(f"Error: Folder not found: {folder_path}")
+        return None
+
+    # Find all subfolders (task IDs) in the given folder
+    if os.path.isdir(folder_path):
+        subfolders = [f for f in glob.glob(os.path.join(folder_path, "*")) if os.path.isdir(f)]
+        if subfolders:
+            # If there are subfolders, evaluate each subfolder
+            print(f"Found {len(subfolders)} subfolders to evaluate")
+            results = aggregate_results(subfolders)
+        else:
+            # If no subfolders, treat the folder itself as a results folder
+            print("No subfolders found, evaluating the folder itself")
+            results = aggregate_results([folder_path])
+
+        # Calculate success rate
+        if results["total"] > 0:
+            results["success_rate"] = results["successful"] / results["total"]
+        else:
+            results["success_rate"] = 0.0
+
+        # Print summary
+        print("\n=== Evaluation Results ===")
+        print(f"Total tasks evaluated: {results['total']}")
+        print(f"Successful tasks: {results['successful']}")
+        print(f"Success rate: {results['success_rate']:.2%}")
+
+        return results
+    else:
+        print(f"Error: {folder_path} is not a directory")
+        return None
 
 def read_settings(file_path):
     """Read and parse the settings.js file to get agent profiles."""
     with open(file_path, 'r', encoding='utf-8') as file:
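Usage sketch for the new helper; the folder path below is an invented example, not from the repo:

    results = check_folder_results("experiments/run_2025_04_17")  # invented path
    if results is not None:
        print(results["success_rate"])  # fraction of evaluated tasks scored successful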
@@ -722,9 +780,16 @@ def main():
     parser.add_argument('--num_examples', default=2, type=int, help='Maximum number of turns before summarizing')
     parser.add_argument('--no-pruning', action='store_true', help='Disable pruning of the actions')
     parser.add_argument('--block_conversation', action='store_true', help='Block conversation actions')
+    parser.add_argument('--check', metavar='FOLDER_PATH', help='Check and evaluate results in the specified folder without running experiments')
     args = parser.parse_args()
     print(args)
+
+    # If --check flag is provided, evaluate results in the specified folder and exit
+    if args.check:
+        check_folder_results(args.check)
+        return
+
     if not args.no_launch_world:
         try:
             subprocess.run(['tmux', 'kill-server'], check=True)
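With the new --check flag, a past run can be re-scored without launching a world; an invocation sketch, where the script filename and folder path are assumptions:

    python evaluation_script.py --check experiments/run_2025_04_17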