From 7d5172628913a09407618cb0b86f59085479f59c Mon Sep 17 00:00:00 2001
From: gmuffiness
Date: Fri, 24 Jan 2025 16:29:03 +0900
Subject: [PATCH] feat: remove promptImageConvo and implement sendVisionRequest
 to each provider

---
 profiles/_default.json          |  2 +-
 src/agent/prompter.js           | 23 -----------------------
 src/agent/vision_interpreter.js |  8 +++++---
 src/models/gpt.js               | 19 +++++++++++++++++++
 src/models/mistral.js           | 16 ++++++++++++++++
 src/utils/text.js               |  4 +++-
 6 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/profiles/_default.json b/profiles/_default.json
index 474b407..fd7b1c2 100644
--- a/profiles/_default.json
+++ b/profiles/_default.json
@@ -9,7 +9,7 @@
 
     "bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:",
 
-    "image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 500 chars.",
+    "image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 400 chars.",
 
     "modes": {
         "self_preservation": true,
diff --git a/src/agent/prompter.js b/src/agent/prompter.js
index 1f46492..2668ad7 100644
--- a/src/agent/prompter.js
+++ b/src/agent/prompter.js
@@ -264,29 +264,6 @@ export class Prompter {
         return '';
     }
 
-    async promptImageConvo(messages, imageBuffer) {
-        await this.checkCooldown();
-        let prompt = this.profile.image_conversing;
-        let imageMessages = [...messages];
-        imageMessages.push({
-            role: "user",
-            content: [
-                { type: "text", text: prompt },
-                {
-                    type: "image_url",
-                    image_url: {
-                        "url": `data:image/jpeg;base64,${imageBuffer.toString('base64')}`,
-                    }
-                }
-            ]
-        });
-
-        return await this.chat_model.sendRequest(
-            imageMessages,
-            prompt
-        );
-    }
-
     async promptCoding(messages) {
         if (this.awaiting_coding) {
             console.warn('Already awaiting coding response, returning no response.');
diff --git a/src/agent/vision_interpreter.js b/src/agent/vision_interpreter.js
index deea836..38b7a44 100644
--- a/src/agent/vision_interpreter.js
+++ b/src/agent/vision_interpreter.js
@@ -37,7 +37,7 @@ export class VisionInterpreter {
             filename = await camera.capture();
         }
 
-        if (!this.allow_vision) {
+        if (!this.allow_vision || !this.agent.prompter.chat_model.sendVisionRequest) {
             log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
             log(this.agent.bot, this._nearbyBlocks());
         } else {
@@ -54,7 +54,7 @@
 
         let filename = await camera.capture();
 
-        if (!this.allow_vision) {
+        if (!this.allow_vision || !this.agent.prompter.chat_model.sendVisionRequest) {
             log(this.agent.bot, "Vision is disabled. Using text-based environment description instead.");
             log(this.agent.bot, this._nearbyBlocks());
         } else {
@@ -63,12 +63,14 @@
     }
 
     async analyzeImage(filename) {
+        let prompt = this.agent.prompter.profile.image_conversing;
         let res = null;
+
         try {
             const bot = this.agent.bot;
             const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
             const messages = this.agent.history.getHistory();
-            res = await this.agent.prompter.promptImageConvo(messages, imageBuffer);
+            res = await this.agent.prompter.chat_model.sendVisionRequest(messages, prompt, imageBuffer);
             log(bot, res);
         } catch (error) {
             log(this.agent.bot, `Error analyzing image: ${error.message}`);
diff --git a/src/models/gpt.js b/src/models/gpt.js
index da29ef1..6664409 100644
--- a/src/models/gpt.js
+++ b/src/models/gpt.js
@@ -53,6 +53,24 @@ export class GPT {
         return res;
     }
 
+    async sendVisionRequest(messages, systemMessage, imageBuffer) {
+        const imageMessages = [...messages];
+        imageMessages.push({
+            role: "user",
+            content: [
+                { type: "text", text: systemMessage },
+                {
+                    type: "image_url",
+                    image_url: {
+                        url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
+                    }
+                }
+            ]
+        });
+
+        return this.sendRequest(imageMessages, systemMessage);
+    }
+
     async embed(text) {
         const embedding = await this.openai.embeddings.create({
             model: this.model_name || "text-embedding-3-small",
@@ -61,6 +79,7 @@ export class GPT {
         });
         return embedding.data[0].embedding;
     }
+
 }
diff --git a/src/models/mistral.js b/src/models/mistral.js
index 3b41f78..124ee9a 100644
--- a/src/models/mistral.js
+++ b/src/models/mistral.js
@@ -61,6 +61,22 @@
         return result;
     }
 
+    async sendVisionRequest(messages, systemMessage, imageBuffer) {
+        const imageMessages = [...messages];
+        imageMessages.push({
+            role: "user",
+            content: [
+                { type: "text", text: systemMessage },
+                {
+                    type: "image_url",
+                    imageUrl: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
+                }
+            ]
+        });
+
+        return this.sendRequest(imageMessages, systemMessage);
+    }
+
     async embed(text) {
         const embedding = await this.#client.embeddings.create({
             model: "mistral-embed",
diff --git a/src/utils/text.js b/src/utils/text.js
index 1e93667..363061f 100644
--- a/src/utils/text.js
+++ b/src/utils/text.js
@@ -33,7 +33,9 @@ export function strictFormat(turns) {
     let messages = [];
     let filler = {role: 'user', content: '_'};
    for (let msg of turns) {
-        msg.content = msg.content.trim();
+        if (typeof msg.content === 'string') {
+            msg.content = msg.content.trim();
+        }
         if (msg.role === 'system') {
             msg.role = 'user';
             msg.content = 'SYSTEM: ' + msg.content;
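
The patch gates vision on a duck-typed check: VisionInterpreter only calls sendVisionRequest if the active chat_model exposes it, and otherwise falls back to the text-based description. For illustration only (not part of the patch), below is a minimal sketch of what another provider wrapper would need in order to opt in. The class name ExampleModel is hypothetical, and the OpenAI-style image_url content shape from gpt.js above is reused; a provider with a different image format would adapt the pushed content, as mistral.js does with its imageUrl field.

// Illustrative sketch, not part of this patch. Defining sendVisionRequest is
// what makes VisionInterpreter's
// `!this.agent.prompter.chat_model.sendVisionRequest` check pass.
export class ExampleModel {
    async sendRequest(messages, systemMessage) {
        // Provider-specific chat completion call would go here.
        return '';
    }

    async sendVisionRequest(messages, systemMessage, imageBuffer) {
        // Attach the screenshot as a base64 data URL on a final user turn,
        // then reuse the ordinary text request path.
        const imageMessages = [...messages];
        imageMessages.push({
            role: "user",
            content: [
                { type: "text", text: systemMessage },
                {
                    type: "image_url",
                    image_url: {
                        url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
                    }
                }
            ]
        });
        return this.sendRequest(imageMessages, systemMessage);
    }
}

The typeof guard added to strictFormat above exists for the same reason: once a turn's content can be an array of text and image parts rather than a plain string, only string content should be trimmed.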