mirror of https://github.com/kolbytn/mindcraft.git, synced 2025-08-09 08:45:33 +02:00
Thoroughly fixed evaluation_script success calculations and added support for debugging them
This commit is contained in:
parent c583e2d5e1
commit e8e8212832

1 changed file with 69 additions and 4 deletions
@@ -57,7 +57,6 @@ def analyze_json_file(file_path):
            for turn in data["turns"]:
                if turn.get("role") == "system" and "content" in turn:
                    if isinstance(turn["content"], str) and "Task ended with score : " in turn["content"]:
                        score_found = True
                        if "Task ended with score : 1" in turn["content"]:
                            return 1
                        elif "Task ended with score : 0" in turn["content"]:
@@ -66,7 +65,8 @@ def analyze_json_file(file_path):
                            score = float(turn["content"].split(":")[-1].strip())
                            return score

            return False

            return None
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
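For context on the parsing above, the fractional-score branch splits the system message on ":" and converts the trailing token to a float. A minimal sketch in Python, assuming a log layout inferred from the code (the task text and score value below are made up):

# Hypothetical task log; only the turns/role/content layout is taken from the
# parsing logic above, the task text and score value are illustrative.
sample_log = {
    "turns": [
        {"role": "user", "content": "Build a small shelter"},
        {"role": "system", "content": "Task ended with score : 0.75"},
    ]
}

# The fractional-score branch reduces to this split-and-strip:
content = sample_log["turns"][-1]["content"]
if "Task ended with score : " in content:
    print(float(content.split(":")[-1].strip()))  # prints 0.75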
@@ -86,11 +86,14 @@ def extract_result(folder_path):
        return None
    else:
        score = None
        curr_score = 0
        for json_file in json_files:
            score = analyze_json_file(json_file)
            if score is not None:
                return score
        return 0
                max_score = max(score, curr_score)
                curr_score = max_score

        return curr_score

def aggregate_results(local_folders):
    """
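Judging by the added lines, extract_result now keeps the best score seen across a folder's JSON logs instead of returning the first one found. A minimal sketch of that accumulation, with made-up values standing in for analyze_json_file() results:

scores = [None, 0.25, 1.0, 0.5]  # hypothetical per-file results; None means no score line was found

curr_score = 0
for score in scores:
    if score is not None:
        # keep the highest score seen so far rather than returning immediately
        curr_score = max(score, curr_score)

print(curr_score)  # prints 1.0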
@@ -106,22 +109,77 @@ def aggregate_results(local_folders):
    total = 0
    successful = 0
    successful_tasks = []
    for folder_path in tqdm(local_folders):
        folder_name = os.path.basename(folder_path)

        try:
            result = extract_result(folder_path)

            if result == 1:
                successful_tasks.append(folder_name)
            if result is not None:
                total += 1
                successful += result
        except Exception as e:
            print(f"Error processing {folder_name}: {e}")

    successful_tasks.sort()

    for i in successful_tasks:
        print(f"Successful task: {i}")

    return {
        "total": total,
        "successful": successful,
    }

def check_folder_results(folder_path):
    """
    Evaluate all JSON files in a folder and its subfolders and calculate success metrics.

    Args:
        folder_path (str): Path to the folder containing JSON log files.

    Returns:
        dict: A dictionary with success metrics.
    """
    print(f"Checking results in folder: {folder_path}")

    # Check if the folder exists
    if not os.path.exists(folder_path):
        print(f"Error: Folder not found: {folder_path}")
        return None

    # Find all subfolders (task IDs) in the given folder
    if os.path.isdir(folder_path):
        subfolders = [f for f in glob.glob(os.path.join(folder_path, "*")) if os.path.isdir(f)]
        if subfolders:
            # If there are subfolders, evaluate each subfolder
            print(f"Found {len(subfolders)} subfolders to evaluate")
            results = aggregate_results(subfolders)
        else:
            # If no subfolders, treat the folder itself as a results folder
            print("No subfolders found, evaluating the folder itself")
            results = aggregate_results([folder_path])

        # Calculate success rate
        if results["total"] > 0:
            results["success_rate"] = results["successful"] / results["total"]
        else:
            results["success_rate"] = 0.0

        # Print summary
        print("\n=== Evaluation Results ===")
        print(f"Total tasks evaluated: {results['total']}")
        print(f"Successful tasks: {results['successful']}")
        print(f"Success rate: {results['success_rate']:.2%}")

        return results
    else:
        print(f"Error: {folder_path} is not a directory")
        return None

def read_settings(file_path):
    """Read and parse the settings.js file to get agent profiles."""
    with open(file_path, 'r', encoding='utf-8') as file:
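A hedged usage sketch for the new helper; the module name is an assumption based on the commit message, and the folder path is made up:

from evaluation_script import check_folder_results  # assumed module name

results = check_folder_results("experiments/construction_tasks")  # illustrative path
if results is not None:
    # e.g. {"total": 8, "successful": 5.5, "success_rate": 0.6875}
    print(results["success_rate"])

Note that successful accumulates the raw scores, so with fractional task scores the reported success rate reflects partial credit rather than a strict pass/fail count.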
@@ -722,9 +780,16 @@ def main():
    parser.add_argument('--num_examples', default=2, type=int, help='Maximum number of turns before summarizing')
    parser.add_argument('--no-pruning', action='store_true', help='Disable pruning of the actions')
    parser.add_argument('--block_conversation', action='store_true', help='Block conversation actions')
    parser.add_argument('--check', metavar='FOLDER_PATH', help='Check and evaluate results in the specified folder without running experiments')

    args = parser.parse_args()
    print(args)

    # If --check flag is provided, evaluate results in the specified folder and exit
    if args.check:
        check_folder_results(args.check)
        return

    if not args.no_launch_world:
        try:
            subprocess.run(['tmux', 'kill-server'], check=True)
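With this flag in place, a run along the lines of the following (script name assumed from the commit message, folder path illustrative) prints the evaluation summary via check_folder_results and returns before any worlds or agents are launched:

python evaluation_script.py --check experiments/construction_tasks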