Mirror of https://github.com/kolbytn/mindcraft.git, synced 2025-07-30 03:45:36 +02:00
feat: remove promptImageConvo and implement sendVisionRequest to each provider
This commit is contained in:
parent 5fce0acaac
commit 7d51726289

6 changed files with 44 additions and 28 deletions
@@ -9,7 +9,7 @@
     "bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:",
 
-    "image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 500 chars.",
+    "image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 400 chars.",
 
     "modes": {
         "self_preservation": true,
@@ -264,29 +264,6 @@ export class Prompter {
         return '';
     }
 
-    async promptImageConvo(messages, imageBuffer) {
-        await this.checkCooldown();
-        let prompt = this.profile.image_conversing;
-        let imageMessages = [...messages];
-        imageMessages.push({
-            role: "user",
-            content: [
-                { type: "text", text: prompt },
-                {
-                    type: "image_url",
-                    image_url: {
-                        "url": `data:image/jpeg;base64,${imageBuffer.toString('base64')}`,
-                    }
-                }
-            ]
-        });
-
-        return await this.chat_model.sendRequest(
-            imageMessages,
-            prompt
-        );
-    }
-
     async promptCoding(messages) {
         if (this.awaiting_coding) {
             console.warn('Already awaiting coding response, returning no response.');
@@ -37,7 +37,7 @@ export class VisionInterpreter {
             filename = await camera.capture();
         }
 
-        if (!this.allow_vision) {
+        if (!this.allow_vision || !this.agent.prompter.chat_model.sendVisionRequest) {
             log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
             log(this.agent.bot, this._nearbyBlocks());
         } else {
@@ -54,7 +54,7 @@ export class VisionInterpreter {
 
         let filename = await camera.capture();
 
-        if (!this.allow_vision) {
+        if (!this.allow_vision || !this.agent.prompter.chat_model.sendVisionRequest) {
             log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
             log(this.agent.bot, this._nearbyBlocks());
         } else {
@@ -63,12 +63,14 @@ export class VisionInterpreter {
         }
     }
 
     async analyzeImage(filename) {
+        let prompt = this.agent.prompter.profile.image_conversing;
         let res = null;
 
         try {
             const bot = this.agent.bot;
             const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
             const messages = this.agent.history.getHistory();
-            res = await this.agent.prompter.promptImageConvo(messages, imageBuffer);
+            res = await this.agent.prompter.chat_model.sendVisionRequest(messages, prompt, imageBuffer);
             log(bot, res);
         } catch (error) {
             log(this.agent.bot, `Error analyzing image: ${error.message}`);
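The two guards above amount to duck typing: a provider advertises vision support simply by defining sendVisionRequest, and VisionInterpreter falls back to the text description otherwise. A minimal sketch of that pattern, with illustrative names not taken from this commit:

    // Sketch of the feature-detection pattern the hunks above rely on.
    // `chatModel` stands in for any provider wrapper (GPT, Mistral, ...).
    function describeScene(chatModel, messages, prompt, imageBuffer) {
        // Providers that implement sendVisionRequest receive the image;
        // all others simply lack the method and get a text-only request.
        if (typeof chatModel.sendVisionRequest === 'function') {
            return chatModel.sendVisionRequest(messages, prompt, imageBuffer);
        }
        return chatModel.sendRequest(messages, prompt);
    }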
@@ -53,6 +53,24 @@ export class GPT {
         return res;
     }
 
+    async sendVisionRequest(messages, systemMessage, imageBuffer) {
+        const imageMessages = [...messages];
+        imageMessages.push({
+            role: "user",
+            content: [
+                { type: "text", text: systemMessage },
+                {
+                    type: "image_url",
+                    image_url: {
+                        url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
+                    }
+                }
+            ]
+        });
+
+        return this.sendRequest(imageMessages, systemMessage);
+    }
+
     async embed(text) {
         const embedding = await this.openai.embeddings.create({
             model: this.model_name || "text-embedding-3-small",

@@ -61,6 +79,7 @@ export class GPT {
         });
         return embedding.data[0].embedding;
     }
 
 }
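For context, a call site might look like the following sketch. The GPT constructor arguments, import path, and file paths are assumptions for illustration, not taken from this commit:

    import fs from 'fs';
    import { GPT } from './gpt.js'; // import path assumed

    // Hypothetical usage of the new method; constructor signature assumed.
    const model = new GPT('gpt-4o');
    const imageBuffer = fs.readFileSync('./screenshots/latest.jpg'); // path illustrative
    const messages = [{ role: 'user', content: 'What do you see around you?' }];
    const summary = await model.sendVisionRequest(
        messages,
        'Summarize the visible Minecraft screen in 400 chars.',
        imageBuffer
    );
    console.log(summary);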
@@ -61,6 +61,22 @@ export class Mistral {
         return result;
     }
 
+    async sendVisionRequest(messages, systemMessage, imageBuffer) {
+        const imageMessages = [...messages];
+        imageMessages.push({
+            role: "user",
+            content: [
+                { type: "text", text: systemMessage },
+                {
+                    type: "image_url",
+                    imageUrl: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
+                }
+            ]
+        });
+
+        return this.sendRequest(imageMessages, systemMessage);
+    }
+
     async embed(text) {
         const embedding = await this.#client.embeddings.create({
             model: "mistral-embed",
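This pair of hunks shows why the helper cannot stay in Prompter: the image content part has a different shape in each SDK. Both shapes below are copied from the hunks above; the base64 payload is placeholder data:

    // Why sendVisionRequest must be implemented per provider:
    const b64 = Buffer.from('fake-image-bytes').toString('base64'); // placeholder data

    const openaiImagePart = {
        type: "image_url",
        image_url: { url: `data:image/jpeg;base64,${b64}` } // nested object, snake_case
    };

    const mistralImagePart = {
        type: "image_url",
        imageUrl: `data:image/jpeg;base64,${b64}` // flat string, camelCase
    };

    console.log(openaiImagePart, mistralImagePart);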
@@ -33,7 +33,9 @@ export function strictFormat(turns) {
     let messages = [];
     let filler = {role: 'user', content: '_'};
     for (let msg of turns) {
-        msg.content = msg.content.trim();
+        if (typeof msg.content === 'string') {
+            msg.content = msg.content.trim();
+        }
         if (msg.role === 'system') {
             msg.role = 'user';
             msg.content = 'SYSTEM: ' + msg.content;
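The new guard matters because vision requests push messages whose content is an array of parts rather than a string, and Array has no trim(). A minimal illustration:

    // A vision message as built by sendVisionRequest above:
    const visionMsg = {
        role: 'user',
        content: [{ type: 'text', text: ' describe this ' }]
    };
    // visionMsg.content.trim(); // TypeError: visionMsg.content.trim is not a function

    // The guarded version from the hunk above leaves array content untouched:
    if (typeof visionMsg.content === 'string') {
        visionMsg.content = visionMsg.content.trim();
    }
    console.log(visionMsg.content); // array preserved as-is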