mirror of https://github.com/kolbytn/mindcraft.git, synced 2025-08-09 08:45:33 +02:00
Thoroughly fixed evaluation_script success calculations and added support for debugging them
This commit is contained in:
parent c583e2d5e1
commit e8e8212832

1 changed file with 69 additions and 4 deletions
@@ -57,7 +57,6 @@ def analyze_json_file(file_path):
            for turn in data["turns"]:
                if turn.get("role") == "system" and "content" in turn:
                    if isinstance(turn["content"], str) and "Task ended with score : " in turn["content"]:
                        score_found = True
                        if "Task ended with score : 1" in turn["content"]:
                            return 1
                        elif "Task ended with score : 0" in turn["content"]:
@@ -66,7 +65,8 @@ def analyze_json_file(file_path):
                            score = float(turn["content"].split(":")[-1].strip())
                            return score

            return False

            return None
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
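For context on the parsing above, the fractional-score branch splits the system message on ":" and converts the trailing token to a float. A minimal sketch in Python, assuming a log layout inferred from the code (the task text and score value below are made up):

# Hypothetical task log; only the turns/role/content layout is taken from the
# parsing logic above, the task text and score value are illustrative.
sample_log = {
    "turns": [
        {"role": "user", "content": "Build a small shelter"},
        {"role": "system", "content": "Task ended with score : 0.75"},
    ]
}

# The fractional-score branch reduces to this split-and-strip:
content = sample_log["turns"][-1]["content"]
if "Task ended with score : " in content:
    print(float(content.split(":")[-1].strip()))  # prints 0.75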
@@ -86,11 +86,14 @@ def extract_result(folder_path):
        return None
    else:
        score = None
        curr_score = 0
        for json_file in json_files:
            score = analyze_json_file(json_file)
            if score is not None:
                return score
        return 0
                max_score = max(score, curr_score)
                curr_score = max_score

        return curr_score

def aggregate_results(local_folders):
    """
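Judging by the added lines, extract_result now keeps the best score seen across a folder's JSON logs instead of returning the first one found. A minimal sketch of that accumulation, with made-up values standing in for analyze_json_file() results:

scores = [None, 0.25, 1.0, 0.5]  # hypothetical per-file results; None means no score line was found

curr_score = 0
for score in scores:
    if score is not None:
        # keep the highest score seen so far rather than returning immediately
        curr_score = max(score, curr_score)

print(curr_score)  # prints 1.0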
@@ -106,22 +109,77 @@ def aggregate_results(local_folders):
    total = 0
    successful = 0
    successful_tasks = []
    for folder_path in tqdm(local_folders):
        folder_name = os.path.basename(folder_path)

        try:
            result = extract_result(folder_path)

            if result == 1:
                successful_tasks.append(folder_name)
            if result is not None:
                total += 1
                successful += result
        except Exception as e:
            print(f"Error processing {folder_name}: {e}")

    successful_tasks.sort()

    for i in successful_tasks:
        print(f"Successful task: {i}")

    return {
        "total": total,
        "successful": successful,
    }

def check_folder_results(folder_path):
    """
    Evaluate all JSON files in a folder and its subfolders and calculate success metrics.

    Args:
        folder_path (str): Path to the folder containing JSON log files.

    Returns:
        dict: A dictionary with success metrics.
    """
    print(f"Checking results in folder: {folder_path}")

    # Check if the folder exists
    if not os.path.exists(folder_path):
        print(f"Error: Folder not found: {folder_path}")
        return None

    # Find all subfolders (task IDs) in the given folder
    if os.path.isdir(folder_path):
        subfolders = [f for f in glob.glob(os.path.join(folder_path, "*")) if os.path.isdir(f)]
        if subfolders:
            # If there are subfolders, evaluate each subfolder
            print(f"Found {len(subfolders)} subfolders to evaluate")
            results = aggregate_results(subfolders)
        else:
            # If no subfolders, treat the folder itself as a results folder
            print("No subfolders found, evaluating the folder itself")
            results = aggregate_results([folder_path])

        # Calculate success rate
        if results["total"] > 0:
            results["success_rate"] = results["successful"] / results["total"]
        else:
            results["success_rate"] = 0.0

        # Print summary
        print("\n=== Evaluation Results ===")
        print(f"Total tasks evaluated: {results['total']}")
        print(f"Successful tasks: {results['successful']}")
        print(f"Success rate: {results['success_rate']:.2%}")

        return results
    else:
        print(f"Error: {folder_path} is not a directory")
        return None

def read_settings(file_path):
    """Read and parse the settings.js file to get agent profiles."""
    with open(file_path, 'r', encoding='utf-8') as file:
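A hedged usage sketch for the new helper; the module name is an assumption based on the commit message, and the folder path is made up:

from evaluation_script import check_folder_results  # assumed module name

results = check_folder_results("experiments/construction_tasks")  # illustrative path
if results is not None:
    # e.g. {"total": 8, "successful": 5.5, "success_rate": 0.6875}
    print(results["success_rate"])

Note that successful accumulates the raw scores, so with fractional task scores the reported success rate reflects partial credit rather than a strict pass/fail count.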
@@ -722,9 +780,16 @@ def main():
    parser.add_argument('--num_examples', default=2, type=int, help='Maximum number of turns before summarizing')
    parser.add_argument('--no-pruning', action='store_true', help='Disable pruning of the actions')
    parser.add_argument('--block_conversation', action='store_true', help='Block conversation actions')
    parser.add_argument('--check', metavar='FOLDER_PATH', help='Check and evaluate results in the specified folder without running experiments')

    args = parser.parse_args()
    print(args)

    # If --check flag is provided, evaluate results in the specified folder and exit
    if args.check:
        check_folder_results(args.check)
        return

    if not args.no_launch_world:
        try:
            subprocess.run(['tmux', 'kill-server'], check=True)
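With this flag in place, a run along the lines of the following (script name assumed from the commit message, folder path illustrative) prints the evaluation summary via check_folder_results and returns before any worlds or agents are launched:

python evaluation_script.py --check experiments/construction_tasks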