Add a script to analyze results, plus a small merge cleanup

This commit is contained in:
Isadora White 2025-03-05 15:20:16 -08:00
parent 7e7f893cf3
commit 5da24ecf79
5 changed files with 325 additions and 31 deletions

258
analyse_results.py Normal file
View file

@ -0,0 +1,258 @@
import boto3
import os
import json
import re
from botocore.exceptions import ClientError
import json
import argparse
from tqdm import tqdm
import glob
def download_s3_folders(bucket_name, s3_prefix, local_base_dir):
    """
    Downloads groups of folders from S3 based on the next level of prefixes.

    Args:
        bucket_name (str): Name of the S3 bucket.
        s3_prefix (str): Prefix where the folders are located (e.g., 'my-experiments/').
        local_base_dir (str): Local directory to download the folders to.

    Returns:
        list: List of downloaded local folder paths; empty list on S3 access error.
    """
    s3_client = boto3.client('s3')
    downloaded_folders = []
    try:
        # Paginate: a single list_objects_v2 call returns at most 1000 keys,
        # which silently truncated large experiment runs before.
        paginator = s3_client.get_paginator('list_objects_v2')
        s3_folder_prefixes = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix, Delimiter='/'):
            s3_folder_prefixes.extend(p['Prefix'] for p in page.get('CommonPrefixes', []))
        if not s3_folder_prefixes:
            print(f"No folders found under s3://{bucket_name}/{s3_prefix}")
            return downloaded_folders
        # Last real path component of the prefix; rstrip makes this work
        # whether or not the caller included a trailing '/'.
        subfolder = s3_prefix.rstrip('/').split('/')[-1]
        for s3_folder_prefix in tqdm(s3_folder_prefixes):
            folder_name = s3_folder_prefix.split('/')[-2]  # prefixes end with '/', so [-2] is the folder name
            local_folder_path = os.path.join(local_base_dir, subfolder, folder_name)
            os.makedirs(local_folder_path, exist_ok=True)
            downloaded_folders.append(local_folder_path)
            # Download files within the folder (paginated for the same reason).
            found_any = False
            for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_folder_prefix):
                for obj in page.get('Contents', []):
                    found_any = True
                    s3_key = obj['Key']
                    if s3_key.endswith('.json'):  # Only download json files
                        local_file_path = os.path.join(local_folder_path, os.path.basename(s3_key))
                        try:
                            s3_client.download_file(bucket_name, s3_key, local_file_path)
                        except Exception as e:
                            print(f"Error downloading {s3_key}: {e}")
            if not found_any:
                print(f"No files found in {s3_folder_prefix}")
    except ClientError as e:
        print(f"Error accessing S3: {e}")
        return []
    return downloaded_folders
def analyze_json_file(file_path):
    """
    Analyzes a single JSON transcript file to extract the task outcome.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        bool or None: True when a system turn contains the success marker,
        False when none does, None when the file is missing or unreadable.
    """
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        turns = data['turns'] if 'turns' in data else None
        if isinstance(turns, list):
            # Scan from the end: the success marker appears in a late system turn.
            return any(
                turn.get('role') == 'system'
                and isinstance(turn.get('content'), str)
                and "Task successful ended with code : 2" in turn['content']
                for turn in reversed(turns)
            )
        return False
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in: {file_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while processing {file_path}: {e}")
        return None
def extract_result(folder_path):
    """
    Determine whether the run stored in a folder succeeded.

    Args:
        folder_path (str): Local folder containing the per-agent JSON transcripts.

    Returns:
        bool or None: True if any transcript reports success, False otherwise,
        None when the folder contains no JSON files.
    """
    folder_name = os.path.basename(folder_path)
    json_files = glob.glob(os.path.join(folder_path, "*.json"))
    if not json_files:
        # Was dead code: an `assert len(json_files) == 2` ran first, so this
        # branch never executed and a bare folder aborted the whole analysis.
        print(f"No JSON files found in {folder_name}")
        return None
    if len(json_files) != 2:
        # Two transcripts (one per agent) are expected; warn but keep going
        # instead of raising (assert is also stripped under `python -O`).
        print(f"Expected 2 json files in {folder_name}, found {len(json_files)}")
    # The run counts as successful if any agent's transcript reports success.
    for json_file in json_files:
        if analyze_json_file(json_file):
            return True
    return False
def is_base(folder_path):
    """True for a baseline run: full plan, depth 0, and no missing resources."""
    has_full_plan = "full_plan" in folder_path
    is_depth_zero = "depth_0" in folder_path
    has_missing = "missing" in folder_path
    return has_full_plan and is_depth_zero and not has_missing
def base_without_plan(folder_path):
    """True for a no-plan baseline: no plan, depth 0, with missing resources."""
    required_tokens = ("no_plan", "depth_0", "missing")
    return all(token in folder_path for token in required_tokens)
def aggregate_results(local_folders):
    """
    Aggregates the analysis results for each folder.

    Args:
        local_folders (list): List of local folder paths containing the JSON files.

    Returns:
        dict: Overall and per-category totals, success counts, and success
        rates (base, base-no-plan, missing, full/partial/no plan, high depth).
    """
    # key -> [successful, total]; replaces fourteen hand-maintained counters.
    counts = {key: [0, 0] for key in (
        "overall", "base", "base_no_plan", "missing",
        "full_plan", "partial_plan", "no_plan", "high_depth",
    )}

    def _tally(key, success):
        # Record one run for a category; success is 0 or 1.
        counts[key][0] += success
        counts[key][1] += 1

    for folder_path in tqdm(local_folders):
        folder_name = os.path.basename(folder_path)
        try:
            # Call extract_result once (it re-reads every JSON in the folder;
            # the original called it twice and discarded the first result).
            result = extract_result(folder_path)
            if result is None:
                # No transcripts found; previously int(None) raised TypeError
                # here. Count the run as unsuccessful instead.
                print(f"No result for {folder_name}; counting as unsuccessful")
            success = int(bool(result))
            _tally("overall", success)
            if "missing" in folder_path:
                _tally("missing", success)
            if is_base(folder_path):
                _tally("base", success)
            if base_without_plan(folder_path):
                _tally("base_no_plan", success)
            if "full_plan" in folder_path and not is_base(folder_path):
                _tally("full_plan", success)
            if "partial_plan" in folder_path and not is_base(folder_path):
                _tally("partial_plan", success)
            if "no_plan" in folder_path and not is_base(folder_path):
                _tally("no_plan", success)
            # Parenthesized: the original `a or b and c` bound `and` tighter
            # than `or`, i.e. depth_1 paths skipped the is_base() check.
            if ("depth_1" in folder_path or "depth_2" in folder_path) and not is_base(folder_path):
                _tally("high_depth", success)
        except Exception as e:
            print(f"Error processing {folder_name}: {e}")

    def _stats(key, prefix):
        # Expand one [successful, total] pair into the three output fields.
        successful, total = counts[key]
        return {
            f"{prefix}total": total,
            f"{prefix}successful": successful,
            f"{prefix}success_rate": successful / total if total > 0 else 0,
        }

    results = _stats("overall", "")
    for key in ("base", "base_no_plan", "missing", "full_plan",
                "partial_plan", "no_plan", "high_depth"):
        results.update(_stats(key, key + "_"))
    return results
def get_immediate_subdirectories(a_dir):
    """Return full paths of the directories directly inside *a_dir*."""
    candidates = (os.path.join(a_dir, entry) for entry in os.listdir(a_dir))
    return [path for path in candidates if os.path.isdir(path)]
# --- Main Execution ---
if __name__ == "__main__":
    # 1. Download folders from AWS (optional) or reuse a local results directory.
    parser = argparse.ArgumentParser()
    # store_true instead of type=bool: argparse applies bool() to the raw
    # string, so any non-empty value — including "False" — was truthy and
    # `--s3_download False` still triggered a download.
    parser.add_argument('--s3_download', action='store_true', help='Download folders from S3')
    parser.add_argument('--aws_bucket_name', default="mindcraft", type=str, help='AWS bucket name')
    parser.add_argument('--s3_folder_prefix', default="experiments/4o_craft_better_tasks_03-02_07-15/", type=str, help='S3 folder prefix')
    parser.add_argument('--local_download_dir', default="results/4o_craft_better_tasks_03-02_07-15/", type=str, help='Local download directory')
    args = parser.parse_args()

    if args.s3_download:
        print(f"Downloading folders from s3://{args.aws_bucket_name}/{args.s3_folder_prefix} to {args.local_download_dir}...")
        folders = download_s3_folders(args.aws_bucket_name, args.s3_folder_prefix, args.local_download_dir)
    else:
        folders = get_immediate_subdirectories(args.local_download_dir)
    print(folders)

    # 2. Analyze files and aggregate per-category success rates.
    results = aggregate_results(folders)
    print(results)

    # 3. Save results to a file. os.path.join avoids the doubled '/' the old
    # string concatenation produced when the directory ended with a slash.
    results_path = os.path.join(args.local_download_dir, "results.txt")
    with open(results_path, "w") as file:
        file.write("Results\n")
        for key, value in results.items():
            file.write(f"{key}: {value}\n")
    print(f"Results saved to {results_path}")

View file

@ -0,0 +1,25 @@
Results
total: 823
successful: 196
success_rate: 0.23815309842041313
base_total: 69
base_successful: 20
base_success_rate: 0.2898550724637681
base_no_plan_total: 27
base_no_plan_successful: 10
base_no_plan_success_rate: 0.37037037037037035
missing_total: 375
missing_successful: 72
missing_success_rate: 0.192
full_plan_total: 196
full_plan_successful: 39
full_plan_success_rate: 0.1989795918367347
partial_plan_total: 282
partial_plan_successful: 62
partial_plan_success_rate: 0.2198581560283688
no_plan_total: 276
no_plan_successful: 75
no_plan_success_rate: 0.2717391304347826
high_depth_total: 505
high_depth_successful: 90
high_depth_success_rate: 0.1782178217821782

View file

@ -0,0 +1,21 @@
Folder: total -> 304
Folder: successful -> 77
Folder: success_rate -> 0.2532894736842105
Folder: base_total -> 39
Folder: base_successful -> 12
Folder: base_success_rate -> 0.3076923076923077
Folder: missing_total -> 102
Folder: missing_successful -> 19
Folder: missing_success_rate -> 0.18627450980392157
Folder: full_plan_total -> 57
Folder: full_plan_successful -> 9
Folder: full_plan_success_rate -> 0.15789473684210525
Folder: partial_plan_total -> 108
Folder: partial_plan_successful -> 26
Folder: partial_plan_success_rate -> 0.24074074074074073
Folder: no_plan_total -> 99
Folder: no_plan_successful -> 30
Folder: no_plan_success_rate -> 0.30303030303030304
Folder: high_depth_total -> 165
Folder: high_depth_successful -> 29
Folder: high_depth_success_rate -> 0.17575757575757575

21
results/clauderesults.txt Normal file
View file

@ -0,0 +1,21 @@
Folder: total -> 601
Folder: successful -> 210
Folder: success_rate -> 0.34941763727121466
Folder: base_total -> 69
Folder: base_successful -> 31
Folder: base_success_rate -> 0.4492753623188406
Folder: missing_total -> 232
Folder: missing_successful -> 87
Folder: missing_success_rate -> 0.375
Folder: full_plan_total -> 126
Folder: full_plan_successful -> 47
Folder: full_plan_success_rate -> 0.373015873015873
Folder: partial_plan_total -> 201
Folder: partial_plan_successful -> 62
Folder: partial_plan_success_rate -> 0.30845771144278605
Folder: no_plan_total -> 205
Folder: no_plan_successful -> 70
Folder: no_plan_success_rate -> 0.34146341463414637
Folder: high_depth_total -> 292
Folder: high_depth_successful -> 55
Folder: high_depth_success_rate -> 0.18835616438356165

View file

@ -64,37 +64,6 @@ export class Agent {
save_data = this.history.load();
}
if (this.task) {
this.task.loadTask(task_path, task_id);
this.taskTimeout = this.task.timeout || 300;
this.taskStartTime = Date.now();
if (this.task.type === 'harvest' || this.task.type === 'techtree') {
// todo: this validator doesn't exist?
// this.validator = new TechTreeHarvestValidator(this.task, this.bot);
}
// this.validator = new TechTreeHarvestValidator(this.task, this.bot);
} else {
console.log('called without task')
this.task = null;
this.taskTimeout = null;
this.validator = null;
}
// handle blocked actions
if (this.task && "blocked_actions" in this.task) {
if ("agent_number" in this.task && this.task.agent_number > 1) {
this.blocked_actions = this.task.blocked_actions[this.name];
console.log(`Blocked actions for ${this.name}:`, this.blocked_actions);
} else {
this.blocked_actions = this.task.blocked_actions;
console.log(`Blocked actions:`, this.blocked_actions);
}
}
console.log("Is validated:", this.validator && this.validator.validate());
this.bot.on('login', () => {
console.log(this.name, 'logged in!');