diff --git a/analyse_results.py b/analyse_results.py
new file mode 100644
index 0000000..aceaf58
--- /dev/null
+++ b/analyse_results.py
@@ -0,0 +1,258 @@
+import boto3
+import os
+import json
+import re
+from botocore.exceptions import ClientError
+import argparse
+from tqdm import tqdm
+import glob
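+
+# Example invocation (a sketch; "<run_name>" is a placeholder, and the flag/argument
+# names are the ones defined by the argument parser at the bottom of this file):
+#   python analyse_results.py --s3_download \
+#       --s3_folder_prefix "experiments/<run_name>/" \
+#       --local_download_dir "results/<run_name>/"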
+ """ + try: + with open(file_path, 'r') as f: + data = json.load(f) + if 'turns' in data and isinstance(data['turns'], list): + for turn in reversed(data['turns']): # Check turns from the end + if turn.get('role') == 'system' and isinstance(turn.get('content'), str): + if "Task successful ended with code : 2" in turn['content']: + return True + return False + except FileNotFoundError: + print(f"Error: File not found: {file_path}") + return None + except json.JSONDecodeError: + print(f"Error: Invalid JSON format in: {file_path}") + return None + except Exception as e: + print(f"An unexpected error occurred while processing {file_path}: {e}") + return None + +def extract_result(folder_path): + folder_name = os.path.basename(folder_path) + json_files = glob.glob(os.path.join(folder_path, "*.json")) + assert len(json_files) == 2, f"Expected 2 json files in {folder_name}, found {len(json_files)}" + + if not json_files: + print(f"No JSON files found in {folder_name}") + return None + else: + outcome = False + for json_file in json_files: + outcome = analyze_json_file(json_file) + if outcome: + return True + return False + +def is_base(folder_path): + return "full_plan" in folder_path and "depth_0" in folder_path and "missing" not in folder_path + +def base_without_plan(folder_path): + return "no_plan" in folder_path and "depth_0" in folder_path and "missing" in folder_path + +def aggregate_results(local_folders): + """ + Aggregates the analysis results for each folder. + + Args: + local_folders (list): List of local folder paths containing the JSON files. + + Returns: + dict: A dictionary where keys are folder names and values are the aggregated outcomes. + """ + aggregated_data = {} + + total = 0 + successful = 0 + + base_successful = 0 + base_total = 0 + + base_no_plan_successful = 0 + base_no_plan_total = 0 + + missing_successful = 0 + missing_total = 0 + + full_plan_successful = 0 + full_plan_total = 0 + + partial_plan_successful = 0 + partial_plan_total = 0 + + no_plan_successful = 0 + no_plan_total = 0 + + high_depth_successful = 0 + high_depth_total = 0 + for folder_path in tqdm(local_folders): + folder_name = os.path.basename(folder_path) + + try: + total += 1 + result = extract_result(folder_path) + success = int(extract_result(folder_path)) + successful += success + + if "missing" in folder_path: + missing_successful += success + missing_total += 1 + if is_base(folder_path): + base_successful += success + base_total += 1 + if base_without_plan(folder_path): + base_no_plan_successful += success + base_no_plan_total += 1 + if "full_plan" in folder_path and not is_base(folder_path): + full_plan_successful += success + full_plan_total += 1 + if "partial_plan" in folder_path and not is_base(folder_path): + partial_plan_successful += success + partial_plan_total += 1 + if "no_plan" in folder_path and not is_base(folder_path): + no_plan_successful += success + no_plan_total += 1 + if "depth_1" in folder_path or "depth_2" in folder_path and not is_base(folder_path): + high_depth_successful += success + high_depth_total += 1 + except Exception as e: + print(f"Error processing {folder_name}: {e}") + + return { + "total": total, + "successful": successful, + "success_rate": successful / total if total > 0 else 0, + "base_total": base_total, + "base_successful": base_successful, + "base_success_rate": base_successful / base_total if base_total > 0 else 0, + "base_no_plan_total": base_no_plan_total, + "base_no_plan_successful": base_no_plan_successful, + "base_no_plan_success_rate": 
+def extract_result(folder_path):
+    folder_name = os.path.basename(folder_path)
+    json_files = glob.glob(os.path.join(folder_path, "*.json"))
+
+    if not json_files:
+        print(f"No JSON files found in {folder_name}")
+        return None
+
+    assert len(json_files) == 2, f"Expected 2 json files in {folder_name}, found {len(json_files)}"
+
+    outcome = False
+    for json_file in json_files:
+        outcome = analyze_json_file(json_file)
+        if outcome:
+            return True
+    return False
+
+def is_base(folder_path):
+    return "full_plan" in folder_path and "depth_0" in folder_path and "missing" not in folder_path
+
+def base_without_plan(folder_path):
+    return "no_plan" in folder_path and "depth_0" in folder_path and "missing" in folder_path
+
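+# Folder names are assumed to encode the experimental condition (plan type, search depth,
+# and whether an item is missing); aggregate_results buckets outcomes by those substrings.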
+def aggregate_results(local_folders):
+    """
+    Aggregates the analysis results for each folder.
+
+    Args:
+        local_folders (list): List of local folder paths containing the JSON files.
+
+    Returns:
+        dict: Aggregated success counts and success rates, overall and per condition.
+    """
+    aggregated_data = {}
+
+    total = 0
+    successful = 0
+
+    base_successful = 0
+    base_total = 0
+
+    base_no_plan_successful = 0
+    base_no_plan_total = 0
+
+    missing_successful = 0
+    missing_total = 0
+
+    full_plan_successful = 0
+    full_plan_total = 0
+
+    partial_plan_successful = 0
+    partial_plan_total = 0
+
+    no_plan_successful = 0
+    no_plan_total = 0
+
+    high_depth_successful = 0
+    high_depth_total = 0
+
+    for folder_path in tqdm(local_folders):
+        folder_name = os.path.basename(folder_path)
+
+        try:
+            total += 1
+            result = extract_result(folder_path)
+            success = int(bool(result))
+            successful += success
+
+            if "missing" in folder_path:
+                missing_successful += success
+                missing_total += 1
+            if is_base(folder_path):
+                base_successful += success
+                base_total += 1
+            if base_without_plan(folder_path):
+                base_no_plan_successful += success
+                base_no_plan_total += 1
+            if "full_plan" in folder_path and not is_base(folder_path):
+                full_plan_successful += success
+                full_plan_total += 1
+            if "partial_plan" in folder_path and not is_base(folder_path):
+                partial_plan_successful += success
+                partial_plan_total += 1
+            if "no_plan" in folder_path and not is_base(folder_path):
+                no_plan_successful += success
+                no_plan_total += 1
+            if ("depth_1" in folder_path or "depth_2" in folder_path) and not is_base(folder_path):
+                high_depth_successful += success
+                high_depth_total += 1
+        except Exception as e:
+            print(f"Error processing {folder_name}: {e}")
+
+    return {
+        "total": total,
+        "successful": successful,
+        "success_rate": successful / total if total > 0 else 0,
+        "base_total": base_total,
+        "base_successful": base_successful,
+        "base_success_rate": base_successful / base_total if base_total > 0 else 0,
+        "base_no_plan_total": base_no_plan_total,
+        "base_no_plan_successful": base_no_plan_successful,
+        "base_no_plan_success_rate": base_no_plan_successful / base_no_plan_total if base_no_plan_total > 0 else 0,
+        "missing_total": missing_total,
+        "missing_successful": missing_successful,
+        "missing_success_rate": missing_successful / missing_total if missing_total > 0 else 0,
+        "full_plan_total": full_plan_total,
+        "full_plan_successful": full_plan_successful,
+        "full_plan_success_rate": full_plan_successful / full_plan_total if full_plan_total > 0 else 0,
+        "partial_plan_total": partial_plan_total,
+        "partial_plan_successful": partial_plan_successful,
+        "partial_plan_success_rate": partial_plan_successful / partial_plan_total if partial_plan_total > 0 else 0,
+        "no_plan_total": no_plan_total,
+        "no_plan_successful": no_plan_successful,
+        "no_plan_success_rate": no_plan_successful / no_plan_total if no_plan_total > 0 else 0,
+        "high_depth_total": high_depth_total,
+        "high_depth_successful": high_depth_successful,
+        "high_depth_success_rate": high_depth_successful / high_depth_total if high_depth_total > 0 else 0
+    }
+
+def get_immediate_subdirectories(a_dir):
+    return [os.path.join(a_dir, name) for name in os.listdir(a_dir)
+            if os.path.isdir(os.path.join(a_dir, name))]
+
+
+# --- Main Execution ---
+if __name__ == "__main__":
+    # 1. Download folders from AWS
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--s3_download', action='store_true', help='Download folders from S3')
+    parser.add_argument('--aws_bucket_name', default="mindcraft", type=str, help='AWS bucket name')
+    parser.add_argument('--s3_folder_prefix', default="experiments/4o_craft_better_tasks_03-02_07-15/", type=str, help='S3 folder prefix')
+    parser.add_argument('--local_download_dir', default="results/4o_craft_better_tasks_03-02_07-15/", type=str, help='Local download directory')
+    args = parser.parse_args()
+
+    AWS_BUCKET_NAME = args.aws_bucket_name
+    S3_FOLDER_PREFIX = args.s3_folder_prefix
+    LOCAL_DOWNLOAD_DIR = args.local_download_dir
+
+    if args.s3_download:
+        print(f"Downloading folders from s3://{args.aws_bucket_name}/{args.s3_folder_prefix} to {args.local_download_dir}...")
+        folders = download_s3_folders(args.aws_bucket_name, args.s3_folder_prefix, args.local_download_dir)
+    else:
+        folders = get_immediate_subdirectories(args.local_download_dir)
+        print(folders)
+
+    results = aggregate_results(folders)
+    print(results)
+
+    # Save results to a file
+    with open(os.path.join(args.local_download_dir, "results.txt"), "w") as file:
+        file.write("Results\n")
+        for key, value in results.items():
+            file.write(f"{key}: {value}\n")
+    print("Results saved to results.txt")
+
+    # if not downloaded_local_folders:
+    #     print("No folders downloaded. Exiting.")
+    #     exit()
+
+    # print("\n--- Analyzing downloaded files ---")
+    # # 2. & 3. Analyze files and aggregate results
+    # results = aggregate_results(downloaded_local_folders)
+
+    # print("\n--- Aggregated Results ---")
+    # for folder, outcome in results.items():
+    #     print(f"Folder: {folder} -> {outcome}")
+
+    # Optional: Clean up downloaded files
+    # import shutil
+    # shutil.rmtree(LOCAL_DOWNLOAD_DIR)
+    # print(f"\nCleaned up {LOCAL_DOWNLOAD_DIR}")
\ No newline at end of file
diff --git a/results/4o_craft_better_tasks_03-02_07-15/results.txt b/results/4o_craft_better_tasks_03-02_07-15/results.txt
new file mode 100644
index 0000000..894ac0b
--- /dev/null
+++ b/results/4o_craft_better_tasks_03-02_07-15/results.txt
@@ -0,0 +1,25 @@
+Results
+total: 823
+successful: 196
+success_rate: 0.23815309842041313
+base_total: 69
+base_successful: 20
+base_success_rate: 0.2898550724637681
+base_no_plan_total: 27
+base_no_plan_successful: 10
+base_no_plan_success_rate: 0.37037037037037035
+missing_total: 375
+missing_successful: 72
+missing_success_rate: 0.192
+full_plan_total: 196
+full_plan_successful: 39
+full_plan_success_rate: 0.1989795918367347
+partial_plan_total: 282
+partial_plan_successful: 62
+partial_plan_success_rate: 0.2198581560283688
+no_plan_total: 276
+no_plan_successful: 75
+no_plan_success_rate: 0.2717391304347826
+high_depth_total: 505
+high_depth_successful: 90
+high_depth_success_rate: 0.1782178217821782
diff --git a/results/4o_craft_better_tasks_03-02_07-15results.txt b/results/4o_craft_better_tasks_03-02_07-15results.txt
new file mode 100644
index 0000000..8740300
--- /dev/null
+++ b/results/4o_craft_better_tasks_03-02_07-15results.txt
@@ -0,0 +1,21 @@
+Folder: total -> 304
+Folder: successful -> 77
+Folder: success_rate -> 0.2532894736842105
+Folder: base_total -> 39
+Folder: base_successful -> 12
+Folder: base_success_rate -> 0.3076923076923077
+Folder: missing_total -> 102
+Folder: missing_successful -> 19
+Folder: missing_success_rate -> 0.18627450980392157
+Folder: full_plan_total -> 57
+Folder: full_plan_successful -> 9
+Folder: full_plan_success_rate -> 0.15789473684210525
+Folder: partial_plan_total -> 108
+Folder: partial_plan_successful -> 26
+Folder: partial_plan_success_rate -> 0.24074074074074073
+Folder: no_plan_total -> 99
+Folder: no_plan_successful -> 30
+Folder: no_plan_success_rate -> 0.30303030303030304
+Folder: high_depth_total -> 165
+Folder: high_depth_successful -> 29
+Folder: high_depth_success_rate -> 0.17575757575757575
diff --git a/results/clauderesults.txt b/results/clauderesults.txt
new file mode 100644
index 0000000..3cc50c1
--- /dev/null
+++ b/results/clauderesults.txt
@@ -0,0 +1,21 @@
+Folder: total -> 601
+Folder: successful -> 210
+Folder: success_rate -> 0.34941763727121466
+Folder: base_total -> 69
+Folder: base_successful -> 31
+Folder: base_success_rate -> 0.4492753623188406
+Folder: missing_total -> 232
+Folder: missing_successful -> 87
+Folder: missing_success_rate -> 0.375
+Folder: full_plan_total -> 126
+Folder: full_plan_successful -> 47
+Folder: full_plan_success_rate -> 0.373015873015873
+Folder: partial_plan_total -> 201
+Folder: partial_plan_successful -> 62
+Folder: partial_plan_success_rate -> 0.30845771144278605
+Folder: no_plan_total -> 205
+Folder: no_plan_successful -> 70
+Folder: no_plan_success_rate -> 0.34146341463414637
+Folder: high_depth_total -> 292
+Folder: high_depth_successful -> 55
+Folder: high_depth_success_rate -> 0.18835616438356165
diff --git a/src/agent/agent.js b/src/agent/agent.js
index c4e928d..71aaf38 100644
--- a/src/agent/agent.js
+++ b/src/agent/agent.js
@@ -64,37 +64,6 @@ export class Agent {
             save_data = this.history.load();
         }
-
-        if (this.task) {
-            this.task.loadTask(task_path, task_id);
-            this.taskTimeout = this.task.timeout || 300;
-            this.taskStartTime = Date.now();
-            if (this.task.type === 'harvest' || this.task.type === 'techtree') {
-                // todo: this validator doesn't exist?
-                // this.validator = new TechTreeHarvestValidator(this.task, this.bot);
-            }
-            // this.validator = new TechTreeHarvestValidator(this.task, this.bot);
-
-        } else {
-            console.log('called without task')
-            this.task = null;
-            this.taskTimeout = null;
-            this.validator = null;
-        }
-
-        // handle blocked actions
-        if (this.task && "blocked_actions" in this.task) {
-            if ("agent_number" in this.task && this.task.agent_number > 1) {
-                this.blocked_actions = this.task.blocked_actions[this.name];
-                console.log(`Blocked actions for ${this.name}:`, this.blocked_actions);
-            } else {
-                this.blocked_actions = this.task.blocked_actions;
-                console.log(`Blocked actions:`, this.blocked_actions);
-            }
-        }
-
-        console.log("Is validated:", this.validator && this.validator.validate());
-
         this.bot.on('login', () => {
             console.log(this.name, 'logged in!');