mirror of https://github.com/kolbytn/mindcraft.git
synced 2025-07-30 20:05:29 +02:00

commit 6ec49e7789 (parent 359592825a)
reworked image prompting, update package

5 changed files with 57 additions and 73 deletions
package.json (12 lines changed)
@@ -5,28 +5,28 @@
     "@google/generative-ai": "^0.2.1",
     "@huggingface/inference": "^2.8.1",
     "@mistralai/mistralai": "^1.1.0",
+    "express": "^4.18.2",
     "google-translate-api-x": "^10.7.1",
     "groq-sdk": "^0.5.0",
     "minecraft-data": "^3.78.0",
-    "mineflayer": "^4.23.0",
+    "mineflayer": "^4.26.0",
     "mineflayer-armor-manager": "^2.0.1",
     "mineflayer-auto-eat": "^3.3.6",
     "mineflayer-collectblock": "^1.4.1",
     "mineflayer-pathfinder": "^2.4.5",
     "mineflayer-pvp": "^1.3.2",
+    "node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
     "openai": "^4.4.0",
     "patch-package": "^8.0.0",
     "prismarine-item": "^1.15.0",
     "prismarine-viewer": "^1.28.0",
     "replicate": "^0.29.4",
     "ses": "^1.9.1",
-    "vec3": "^0.1.10",
-    "yargs": "^17.7.2",
     "socket.io": "^4.7.2",
     "socket.io-client": "^4.7.2",
-    "express": "^4.18.2",
-    "node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
-    "three": "^0.128.0"
+    "three": "^0.128.0",
+    "vec3": "^0.1.10",
+    "yargs": "^17.7.2"
   },
   "scripts": {
     "postinstall": "patch-package",
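The substantive change here is the mineflayer bump from ^4.23.0 to ^4.26.0; the rest of the hunk is an alphabetical re-sort of the dependency list. A minimal connection script can smoke-test the bump. This is a sketch, assuming a local Minecraft server on the default port; the bot name is arbitrary:

```js
// Minimal smoke test for the mineflayer ^4.26.0 bump. Assumes a local
// Minecraft server on the default port; the username is arbitrary.
import mineflayer from 'mineflayer';

const bot = mineflayer.createBot({
    host: 'localhost',
    port: 25565,
    username: 'version_check_bot',
});

bot.once('spawn', () => {
    console.log('Connected; the mineflayer update works against this server.');
    bot.quit();
});
```

Run it after `npm install`; the `postinstall` hook applies patch-package automatically.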
@@ -9,7 +9,7 @@
     "bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:",
 
-    "image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 400 chars.",
+    "image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, structures, and notable features. Focus on details relevant to the conversation. Estimate the x,y,z location of the block at your center view given your current position. Be extremely concise and correct, respond only with your analysis, not conversationally. $STATS",
 
     "modes": {
         "self_preservation": true,
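The reworked prompt replaces the chat-oriented image_conversing key with image_analysis, which leans on $NAME and $STATS placeholders. In this codebase those are substituted by the Prompter (via replaceStrings) before the request is sent; the sketch below imitates that substitution with a hypothetical fillPlaceholders helper, not the repo's API:

```js
// Hypothetical stand-in for the Prompter's placeholder substitution,
// showing how $NAME and $STATS in the new image_analysis prompt get filled.
function fillPlaceholders(template, values) {
    return template
        .replaceAll('$NAME', values.name ?? '')
        .replaceAll('$STATS', values.stats ?? '');
}

// Truncated from the profile value above.
const imageAnalysisTemplate =
    'You are a Minecraft bot named $NAME that has been given a screenshot of your current view. $STATS';

console.log(fillPlaceholders(imageAnalysisTemplate, {
    name: 'andy',
    stats: 'STATS\n- Position: x: 12, y: 64, z: -3',
}));
```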
@@ -411,19 +411,23 @@ export const actionsList = [
         name: '!lookAtPlayer',
         description: 'Look at a player or look in the same direction as the player.',
         params: {
-            'player_name': {
-                type: 'string',
-                description: 'Name of the target player'
-            },
+            'player_name': { type: 'string', description: 'Name of the target player' },
             'direction': {
                 type: 'string',
                 description: 'How to look ("at": look at the player, "with": look in the same direction as the player)',
                 enum: ['at', 'with']
             }
         },
-        perform: runAsAction(async (agent, player_name, direction) => {
-            await agent.vision_interpreter.lookAtPlayer(player_name, direction);
-        })
+        perform: async function(agent, player_name, direction) {
+            if (direction !== 'at' && direction !== 'with') {
+                return "Invalid direction. Use 'at' or 'with'.";
+            }
+            let result = "";
+            const actionFn = async () => {
+                result = await agent.vision_interpreter.lookAtPlayer(player_name, direction);
+            };
+            await agent.actions.runAction('action:lookAtPlayer', actionFn);
+            return result;
+        }
     },
     {
         name: '!lookAtPosition',
@@ -433,8 +437,13 @@
             'y': { type: 'int', description: 'y coordinate' },
             'z': { type: 'int', description: 'z coordinate' }
         },
-        perform: runAsAction(async (agent, x, y, z) => {
-            await agent.vision_interpreter.lookAtPosition(x, y, z);
-        })
+        perform: async function(agent, x, y, z) {
+            let result = "";
+            const actionFn = async () => {
+                result = await agent.vision_interpreter.lookAtPosition(x, y, z);
+            };
+            await agent.actions.runAction('action:lookAtPosition', actionFn);
+            return result;
+        }
     }
 ];
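Both rewritten perform handlers follow the same pattern: validate inputs up front, capture the vision result in a closure, run the closure through agent.actions.runAction so it is tracked like any other bot action, then return the captured string to the command dispatcher. The stripped-down sketch below isolates that pattern; runAction here is a stub, not the repo's implementation:

```js
// Stub of agent.actions.runAction, just enough to show the pattern
// (the real one also handles interrupts, timeouts, and bot state).
async function runAction(label, actionFn) {
    console.log(`starting ${label}`);
    await actionFn();
    console.log(`finished ${label}`);
}

// Same shape as the new perform handlers: validate, capture the result
// in a closure, run it as a tracked action, return the captured string.
async function performLookAtPlayer(lookAtPlayer, player_name, direction) {
    if (direction !== 'at' && direction !== 'with') {
        return "Invalid direction. Use 'at' or 'with'.";
    }
    let result = "";
    const actionFn = async () => {
        result = await lookAtPlayer(player_name, direction);
    };
    await runAction('action:lookAtPlayer', actionFn);
    return result;
}

// Usage with a fake vision call:
const fakeLookAtPlayer = async (name) => `Looking at player ${name}\n`;
performLookAtPlayer(fakeLookAtPlayer, 'billy', 'at').then(console.log);
```

The closure is what lets a fire-and-forget action runner still deliver a return value to the caller.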
@@ -1,12 +1,8 @@
 import { Vec3 } from 'vec3';
 import { Camera } from "../utils/camera.js";
 import fs from 'fs';
-import { log } from './library/skills.js';
-import * as world from './library/world.js';
-
-const pad = (str) => {
-    return '\n' + str + '\n';
-}
-const RENDER_TIME = 1000;
 
 export class VisionInterpreter {
     constructor(agent, allow_vision) {
@@ -16,89 +12,61 @@
     }
 
     async lookAtPlayer(player_name, direction) {
+        if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
+            return "Vision is disabled. Use other methods to describe the environment.";
+        }
+        let result = "";
         const bot = this.agent.bot;
         const player = bot.players[player_name]?.entity;
         if (!player) {
-            log(bot, `Could not find player ${player_name}`);
+            return `Could not find player ${player_name}`;
         }
 
         let filename;
         if (direction === 'with') {
             await bot.look(player.yaw, player.pitch);
             const camera = new Camera(bot, this.fp);
-            log(bot, `Looking in the same direction as ${player_name}`);
-            await new Promise(resolve => setTimeout(resolve, RENDER_TIME));
+            await new Promise(resolve => setTimeout(resolve, 500));
+            result = `Looking in the same direction as ${player_name}\n`;
             filename = await camera.capture();
         } else {
             await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
             const camera = new Camera(bot, this.fp);
-            log(bot, `Looking at player ${player_name}`);
-            await new Promise(resolve => setTimeout(resolve, RENDER_TIME));
+            await new Promise(resolve => setTimeout(resolve, 500));
+            result = `Looking at player ${player_name}\n`;
             filename = await camera.capture();
-
         }
-
-        if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
-            log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
-            log(this.agent.bot, this._nearbyBlocks());
-        } else {
-            await this.analyzeImage(filename);
-        }
+        return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
     }
 
     async lookAtPosition(x, y, z) {
+        if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
+            return "Vision is disabled. Use other methods to describe the environment.";
+        }
+        let result = "";
         const bot = this.agent.bot;
         await bot.lookAt(new Vec3(x, y + 2, z));
         const camera = new Camera(bot, this.fp);
-        log(bot, `Looking at coordinate ${x, y, z}`);
-        await new Promise(resolve => setTimeout(resolve, RENDER_TIME));
+        await new Promise(resolve => setTimeout(resolve, 500));
+        result = `Looking at coordinate ${x, y, z}\n`;
 
         let filename = await camera.capture();
 
-        if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
-            log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
-            log(this.agent.bot, this._nearbyBlocks());
-        } else {
-            await this.analyzeImage(filename);
-        }
+        return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
     }
 
     async analyzeImage(filename) {
-        let prompt = this.agent.prompter.profile.image_conversing;
-        let res = null;
-
         try {
-            const bot = this.agent.bot;
             const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
             const messages = this.agent.history.getHistory();
-            res = await this.agent.prompter.vision_model.sendVisionRequest(messages, prompt, imageBuffer);
-
-            if (res == 'Vision is only supported by certain models.') {
-                log(bot, "Vision may not be supported on this model. Using text-based environment description instead.");
-                log(bot, this._nearbyBlocks());
-            } else {
-                log(bot, res);
-            }
-
+            return await this.agent.prompter.promptVision(messages, imageBuffer);
         } catch (error) {
-            log(this.agent.bot, `Error analyzing image: ${error.message}`);
             console.warn('Error reading image:', error);
             return `Error reading image: ${error.message}`;
         }
     }
-
-    _nearbyBlocks() {
-        const bot = this.agent.bot;
-        let res = 'NEARBY_BLOCKS';
-
-        let blocks = world.getNearbyBlockTypes(bot);
-        for (let i = 0; i < blocks.length; i++) {
-            res += `\n- ${blocks[i]}`;
-        }
-        if (blocks.length == 0) {
-            res += ': none';
-        } else {
-            // Environmental Awareness
-            res += '\n- ' + world.getSurroundingBlocks(bot).join('\n- ')
-            res += `\n- First Solid Block Above Head: ${world.getFirstBlockAboveHead(bot, null, 32)}`;
-        }
-        return pad(res);
-    }
 }
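One caveat survives the rewrite: in lookAtPosition, `${x, y, z}` inside the template literal invokes JavaScript's comma operator, so only z is interpolated; `${x}, ${y}, ${z}` is presumably the intent. Plain Node demonstrates the difference:

```js
// The comma operator evaluates to its last operand, so only z appears:
const [x, y, z] = [100, 64, -20];
console.log(`Looking at coordinate ${x, y, z}`);        // Looking at coordinate -20
console.log(`Looking at coordinate ${x}, ${y}, ${z}`);  // Looking at coordinate 100, 64, -20
```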
@@ -343,6 +343,13 @@
         return res.trim().toLowerCase() === 'respond';
     }
 
+    async promptVision(messages, imageBuffer) {
+        await this.checkCooldown();
+        let prompt = this.profile.image_analysis;
+        prompt = await this.replaceStrings(prompt, messages, null, null, null);
+        return await this.vision_model.sendVisionRequest(messages, prompt, imageBuffer);
+    }
+
     async promptGoalSetting(messages, last_goals) {
         let system_message = this.profile.goal_setting;
         system_message = await this.replaceStrings(system_message, messages);
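With this addition, the vision flow reads straight through the commit: VisionInterpreter.analyzeImage loads the captured screenshot and hands the chat history plus image buffer to the new promptVision, which fills the image_analysis placeholders and forwards everything to vision_model.sendVisionRequest. A condensed sketch of that path, with the screenshot location and agent wiring assumed from the diff above:

```js
// Condensed view of the new vision call path. `fp` and `filename` come
// from the Camera capture shown in vision_interpreter.js above.
import fs from 'fs';

async function describeScreenshot(agent, fp, filename) {
    const imageBuffer = fs.readFileSync(`${fp}/${filename}.jpg`);
    const messages = agent.history.getHistory();
    // promptVision fills image_analysis's $NAME/$STATS placeholders and
    // forwards history, prompt, and image to vision_model.sendVisionRequest.
    return await agent.prompter.promptVision(messages, imageBuffer);
}
```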