Thoroughly fixed evaluation_script success calculations and added support for debugging the same

Ayush Maniar 2025-04-17 13:06:19 -07:00
parent c583e2d5e1
commit e8e8212832


@@ -57,7 +57,6 @@ def analyze_json_file(file_path):
         for turn in data["turns"]:
             if turn.get("role") == "system" and "content" in turn:
                 if isinstance(turn["content"], str) and "Task ended with score : " in turn["content"]:
-                    score_found = True
                     if "Task ended with score : 1" in turn["content"]:
                         return 1
                     elif "Task ended with score : 0" in turn["content"]:
@@ -66,7 +65,8 @@ def analyze_json_file(file_path):
                         score = float(turn["content"].split(":")[-1].strip())
                         return score
-        return False
+        return None
     except FileNotFoundError:
         print(f"Error: File not found: {file_path}")
         return None
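For reference, the retained parsing line extracts the numeric score by splitting the system message on its last colon; a minimal sketch of that line in isolation, with an invented example message:

    content = "Task ended with score : 0.75"      # invented example message
    score = float(content.split(":")[-1].strip())  # text after the last colon -> 0.75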
@@ -86,11 +86,14 @@ def extract_result(folder_path):
         return None
     else:
         score = None
+        curr_score = 0
         for json_file in json_files:
             score = analyze_json_file(json_file)
             if score is not None:
-                return score
-        return 0
+                max_score = max(score, curr_score)
+                curr_score = max_score
+        return curr_score
 
 def aggregate_results(local_folders):
     """
@@ -106,22 +109,77 @@ def aggregate_results(local_folders):
     total = 0
     successful = 0
+    successful_tasks = []
     for folder_path in tqdm(local_folders):
         folder_name = os.path.basename(folder_path)
         try:
             result = extract_result(folder_path)
+            if result == 1:
+                successful_tasks.append(folder_name)
             if result is not None:
                 total += 1
                 successful += result
         except Exception as e:
             print(f"Error processing {folder_name}: {e}")
+    successful_tasks.sort()
+    for i in successful_tasks:
+        print(f"Successful task: {i}")
     return {
         "total": total,
         "successful": successful,
     }
 
+def check_folder_results(folder_path):
+    """
+    Evaluate all JSON files in a folder and its subfolders and calculate success metrics.
+
+    Args:
+        folder_path (str): Path to the folder containing JSON log files.
+
+    Returns:
+        dict: A dictionary with success metrics.
+    """
+    print(f"Checking results in folder: {folder_path}")
+
+    # Check if the folder exists
+    if not os.path.exists(folder_path):
+        print(f"Error: Folder not found: {folder_path}")
+        return None
+
+    # Find all subfolders (task IDs) in the given folder
+    if os.path.isdir(folder_path):
+        subfolders = [f for f in glob.glob(os.path.join(folder_path, "*")) if os.path.isdir(f)]
+        if subfolders:
+            # If there are subfolders, evaluate each subfolder
+            print(f"Found {len(subfolders)} subfolders to evaluate")
+            results = aggregate_results(subfolders)
+        else:
+            # If no subfolders, treat the folder itself as a results folder
+            print("No subfolders found, evaluating the folder itself")
+            results = aggregate_results([folder_path])
+
+        # Calculate success rate
+        if results["total"] > 0:
+            results["success_rate"] = results["successful"] / results["total"]
+        else:
+            results["success_rate"] = 0.0
+
+        # Print summary
+        print("\n=== Evaluation Results ===")
+        print(f"Total tasks evaluated: {results['total']}")
+        print(f"Successful tasks: {results['successful']}")
+        print(f"Success rate: {results['success_rate']:.2%}")
+
+        return results
+    else:
+        print(f"Error: {folder_path} is not a directory")
+        return None
 
 def read_settings(file_path):
     """Read and parse the settings.js file to get agent profiles."""
     with open(file_path, 'r', encoding='utf-8') as file:
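Usage sketch for the new helper; the folder path below is an invented example, not from the repo:

    results = check_folder_results("experiments/run_2025_04_17")  # invented path
    if results is not None:
        print(results["success_rate"])  # fraction of evaluated tasks scored successful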
@@ -722,9 +780,16 @@ def main():
     parser.add_argument('--num_examples', default=2, type=int, help='Maximum number of turns before summarizing')
     parser.add_argument('--no-pruning', action='store_true', help='Disable pruning of the actions')
     parser.add_argument('--block_conversation', action='store_true', help='Block conversation actions')
+    parser.add_argument('--check', metavar='FOLDER_PATH', help='Check and evaluate results in the specified folder without running experiments')
     args = parser.parse_args()
     print(args)
+
+    # If --check flag is provided, evaluate results in the specified folder and exit
+    if args.check:
+        check_folder_results(args.check)
+        return
+
     if not args.no_launch_world:
         try:
             subprocess.run(['tmux', 'kill-server'], check=True)
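With the new --check flag, a past run can be re-scored without launching a world; an invocation sketch, where the script filename and folder path are assumptions:

    python evaluation_script.py --check experiments/run_2025_04_17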