Merge pull request #471 from kolbytn/vision

Vision
Max Robinson 2025-03-16 19:18:25 -05:00 committed by GitHub
commit 5bafdfddf6
19 changed files with 454 additions and 99 deletions

View file

@@ -119,6 +119,11 @@ You can pass a string or an object for these fields. A model object must specify
"model": "gpt-4",
"url": "https://api.openai.com/v1/"
},
"vision_model": {
"api": "openai",
"model": "gpt-4o",
"url": "https://api.openai.com/v1/"
},
"embedding": {
"api": "openai",
"url": "https://api.openai.com/v1/",
@@ -127,7 +132,7 @@ You can pass a string or an object for these fields. A model object must specify
```
`model` is used for chat, `code_model` is used for newAction coding, and `embedding` is used to embed text for example selection. If `code_model` or `embedding` are not specified, they will use `model` by default. Not all APIs have an embedding model.
`model` is used for chat, `code_model` is used for newAction coding, `vision_model` is used for image interpretation, and `embedding` is used to embed text for example selection. If `code_model` or `vision_model` is not specified, `model` will be used by default. Not all APIs support embeddings or vision.
All APIs have default models and URLs, so those fields are optional. The `params` field is optional and can be used to specify additional parameters for the model. It accepts any key-value pairs supported by the API. It is not supported for embedding models.
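For illustration, a minimal profile snippet mixing the two forms described above — a plain string for `model` and an object for `vision_model`; the model names are just examples taken from this section, not recommendations:

```json
{
    "model": "gpt-4",
    "vision_model": {
        "api": "openai",
        "model": "gpt-4o",
        "url": "https://api.openai.com/v1/"
    }
}
```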

View file

@@ -11,12 +11,13 @@
"google-translate-api-x": "^10.7.1",
"groq-sdk": "^0.15.0",
"minecraft-data": "^3.78.0",
"mineflayer": "^4.23.0",
"mineflayer": "^4.26.0",
"mineflayer-armor-manager": "^2.0.1",
"mineflayer-auto-eat": "^3.3.6",
"mineflayer-collectblock": "^1.4.1",
"mineflayer-pathfinder": "^2.4.5",
"mineflayer-pvp": "^1.3.2",
"node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
"openai": "^4.4.0",
"patch-package": "^8.0.0",
"prismarine-item": "^1.15.0",
@@ -25,6 +26,7 @@
"ses": "^1.9.1",
"socket.io": "^4.7.2",
"socket.io-client": "^4.7.2",
"three": "^0.128.0",
"vec3": "^0.1.10",
"yargs": "^17.7.2"
},

View file

@@ -0,0 +1,13 @@
diff --git a/node_modules/prismarine-viewer/viewer/lib/entity/Entity.js b/node_modules/prismarine-viewer/viewer/lib/entity/Entity.js
index 8945452..dab25be 100644
--- a/node_modules/prismarine-viewer/viewer/lib/entity/Entity.js
+++ b/node_modules/prismarine-viewer/viewer/lib/entity/Entity.js
@@ -203,7 +203,7 @@ function getMesh (texture, jsonModel) {
class Entity {
constructor (version, type, scene) {
const e = entities[type]
- if (!e) throw new Error(`Unknown entity ${type}`)
+ if (!e) return; //throw new Error(`Unknown entity ${type}`)
this.mesh = new THREE.Object3D()
for (const [name, jsonModel] of Object.entries(e.geometry)) {
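The patch above changes prismarine-viewer's Entity class so unknown entity types no longer throw during rendering. It is in the format that `patch-package` (already in the dependency list) applies from a `patches/` directory; the usual wiring is a postinstall hook, sketched below as an assumption — the scripts section is not shown in this diff:

```json
{
    "scripts": {
        "postinstall": "patch-package"
    }
}
```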

View file

@@ -9,6 +9,8 @@
"bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:",
"image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, entities, structures, and notable features. Focus on details relevant to the conversation. Note: the sky is always blue regardless of weather or time, dropped items are small pink cubes, and blocks below y=0 do not render. Be extremely concise and correct, respond only with your analysis, not conversationally. $STATS",
"modes": {
"self_preservation": true,
"unstuck": true,
@@ -65,14 +67,20 @@
{"role": "user", "content": "derg: (FROM OTHER BOT)Thanks, see you later!"},
{"role": "assistant", "content": "See you later. !endConversation(\"derg\")"}
],
[
{"role": "user", "content": "grombo_Xx: What do you see?"},
{"role": "assistant", "content": "Let me see... !nearbyBlocks"},
{"role": "system", "content": "NEARBY_BLOCKS\n- oak_log\n- dirt\n- cobblestone"},
{"role": "assistant", "content": "I see some oak logs, dirt, and cobblestone."}
{"role": "user", "content": "d7297: look here"},
{"role": "assistant", "content": "Okay! !lookAtPlayer(\"d7297\", \"at\")"},
{"role": "system", "content": "Image analysis:\nLooking at player d7297. There is a giant waterfall behind d7297."},
{"role": "assistant", "content": "Woo, that is a cool waterfall!"}
],
[
{"role": "user", "content": "d7297: look 0 70 12 and describe what is there"},
{"role": "assistant", "content": "Sure! !lookAtPosition(0, 70, 12)"},
{"role": "system", "content": "Image analysis:\nLooking at position 0, 70, 12. There is a water fountain."},
{"role": "assistant", "content": "I see the water fountain! amazing!"}
],
[
{"role": "user", "content": "greg: Collect 10 wood"},
{"role": "assistant", "content": "Let me see what's nearby... !nearbyBlocks"},

View file

@@ -1,6 +1,6 @@
export default
{
"minecraft_version": "1.20.4", // supports up to 1.21.1
"minecraft_version": "1.21.1", // supports up to 1.21.1
"host": "127.0.0.1", // or "localhost", "your.ip.address.here"
"port": process.env.MINECRAFT_PORT || 55916,
"auth": "offline", // or "microsoft"
@@ -34,6 +34,7 @@ export default
"show_bot_views": false, // show bot's view in browser at localhost:3000, 3001...
"allow_insecure_coding": false, // allows newAction command and model can write/run code on your computer. enable at own risk
"allow_vision": false, // allows vision model to interpret screenshots as inputs
"blocked_actions" : [], // commands to disable and remove from docs. Ex: ["!setMode"]
"code_timeout_mins": -1, // minutes code is allowed to run. -1 for no timeout
"relevant_docs_count": 5, // number of relevant code function docs to select for prompting. -1 for all

View file

@@ -1,5 +1,6 @@
import { History } from './history.js';
import { Coder } from './coder.js';
import { VisionInterpreter } from './vision/vision_interpreter.js';
import { Prompter } from '../models/prompter.js';
import { initModes } from './modes.js';
import { initBot } from '../utils/mcdata.js';
@@ -10,7 +11,7 @@ import { MemoryBank } from './memory_bank.js';
import { SelfPrompter } from './self_prompter.js';
import convoManager from './conversation.js';
import { handleTranslation, handleEnglishTranslation } from '../utils/translator.js';
import { addViewer } from './viewer.js';
import { addBrowserViewer } from './vision/browser_viewer.js';
import settings from '../../settings.js';
import { serverProxy } from './agent_proxy.js';
import { Task } from './tasks.js';
@@ -80,7 +81,7 @@ export class Agent {
this.bot.once('spawn', async () => {
try {
clearTimeout(spawnTimeout);
addViewer(this.bot, count_id);
addBrowserViewer(this.bot, count_id);
// wait for a bit so stats are not undefined
await new Promise((resolve) => setTimeout(resolve, 1000));
@@ -95,6 +96,9 @@
this.task.initBotTask();
}
console.log('Initializing vision interpreter...');
this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision);
} catch (error) {
console.error('Error in spawn event:', error);
process.exit(0);

View file

@@ -407,6 +407,45 @@ export const actionsList = [
return `Conversation with ${player_name} ended.`;
}
},
{
name: '!lookAtPlayer',
description: 'Look at a player or look in the same direction as the player.',
params: {
'player_name': { type: 'string', description: 'Name of the target player' },
'direction': {
type: 'string',
description: 'How to look ("at": look at the player, "with": look in the same direction as the player)',
}
},
perform: async function(agent, player_name, direction) {
if (direction !== 'at' && direction !== 'with') {
return "Invalid direction. Use 'at' or 'with'.";
}
let result = "";
const actionFn = async () => {
result = await agent.vision_interpreter.lookAtPlayer(player_name, direction);
};
await agent.actions.runAction('action:lookAtPlayer', actionFn);
return result;
}
},
{
name: '!lookAtPosition',
description: 'Look at specified coordinates.',
params: {
'x': { type: 'int', description: 'x coordinate' },
'y': { type: 'int', description: 'y coordinate' },
'z': { type: 'int', description: 'z coordinate' }
},
perform: async function(agent, x, y, z) {
let result = "";
const actionFn = async () => {
result = await agent.vision_interpreter.lookAtPosition(x, y, z);
};
await agent.actions.runAction('action:lookAtPosition', actionFn);
return result;
}
},
{
name: '!digDown',
description: 'Digs down a specified distance. Will stop if it reaches lava, water, or a fall of >=4 blocks below the bot.',
@@ -415,17 +454,4 @@ export const actionsList = [
await skills.digDown(agent.bot, distance)
})
},
// { // commented for now, causes confusion with goal command
// name: '!npcGoal',
// description: 'Set a simple goal for an item or building to automatically work towards. Do not use for complex goals.',
// params: {
// 'name': { type: 'string', description: 'The name of the goal to set. Can be item or building name. If empty will automatically choose a goal.' },
// 'quantity': { type: 'int', description: 'The quantity of the goal to set. Default is 1.', domain: [1, Number.MAX_SAFE_INTEGER] }
// },
// perform: async function (agent, name=null, quantity=1) {
// await agent.npc.setGoal(name, quantity);
// agent.bot.emit('idle'); // to trigger the goal
// return 'Set npc goal: ' + agent.npc.data.curr_goal.name;
// }
// },
];
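The two vision commands added above follow the same `!command` syntax as the rest of the action list: `"at"` points the bot's head at the player before capturing, `"with"` copies the player's yaw and pitch. Hypothetical chat invocations (player name and coordinates are made up):

```
!lookAtPlayer("d7297", "at")
!lookAtPlayer("d7297", "with")
!lookAtPosition(0, 70, 12)
```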

View file

@@ -1399,7 +1399,6 @@ export async function activateNearestBlock(bot, type) {
return true;
}
export async function digDown(bot, distance = 10) {
/**
* Digs down a specified distance. Will stop if it reaches lava, water, or a fall of >=4 blocks below the bot.

View file

@@ -1,8 +1,8 @@
import settings from '../../settings.js';
import settings from '../../../settings.js';
import prismarineViewer from 'prismarine-viewer';
const mineflayerViewer = prismarineViewer.mineflayer;
export function addViewer(bot, count_id) {
export function addBrowserViewer(bot, count_id) {
if (settings.show_bot_views)
mineflayerViewer(bot, { port: 3000+count_id, firstPerson: true, });
}

View file

@@ -0,0 +1,78 @@
import { Viewer } from 'prismarine-viewer/viewer/lib/viewer.js';
import { WorldView } from 'prismarine-viewer/viewer/lib/worldview.js';
import { getBufferFromStream } from 'prismarine-viewer/viewer/lib/simpleUtils.js';
import THREE from 'three';
import { createCanvas } from 'node-canvas-webgl/lib/index.js';
import fs from 'fs/promises';
import { Vec3 } from 'vec3';
import { EventEmitter } from 'events';
import worker_threads from 'worker_threads';
global.Worker = worker_threads.Worker;
export class Camera extends EventEmitter {
constructor (bot, fp) {
super();
this.bot = bot;
this.fp = fp;
this.viewDistance = 12;
this.width = 800;
this.height = 512;
this.canvas = createCanvas(this.width, this.height);
this.renderer = new THREE.WebGLRenderer({ canvas: this.canvas });
this.viewer = new Viewer(this.renderer);
this._init().then(() => {
this.emit('ready');
})
}
async _init () {
const botPos = this.bot.entity.position;
const center = new Vec3(botPos.x, botPos.y+this.bot.entity.height, botPos.z);
this.viewer.setVersion(this.bot.version);
// Load world
const worldView = new WorldView(this.bot.world, this.viewDistance, center);
this.viewer.listen(worldView);
worldView.listenToBot(this.bot);
await worldView.init(center);
this.worldView = worldView;
}
async capture() {
const center = new Vec3(this.bot.entity.position.x, this.bot.entity.position.y+this.bot.entity.height, this.bot.entity.position.z);
this.viewer.camera.position.set(center.x, center.y, center.z);
await this.worldView.updatePosition(center);
this.viewer.setFirstPersonCamera(this.bot.entity.position, this.bot.entity.yaw, this.bot.entity.pitch);
this.viewer.update();
this.renderer.render(this.viewer.scene, this.viewer.camera);
const imageStream = this.canvas.createJPEGStream({
bufsize: 4096,
quality: 100,
progressive: false
});
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
const filename = `screenshot_${timestamp}`;
const buf = await getBufferFromStream(imageStream);
await this._ensureScreenshotDirectory();
await fs.writeFile(`${this.fp}/${filename}.jpg`, buf);
console.log('saved', filename);
return filename;
}
async _ensureScreenshotDirectory() {
let stats;
try {
stats = await fs.stat(this.fp);
} catch (e) {
if (!stats?.isDirectory()) {
await fs.mkdir(this.fp);
}
}
}
}
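A minimal usage sketch for the new `Camera` class, assuming an already-spawned mineflayer `bot` and an illustrative output directory (both are assumptions, not shown here): construct it, wait for `ready`, then call `capture()`, which renders the bot's first-person view headlessly and returns the filename without the `.jpg` extension.

```js
import { Camera } from './camera.js';

// `bot` is assumed to be a spawned mineflayer bot; the path is illustrative.
const camera = new Camera(bot, './bots/andy/screenshots');
camera.once('ready', async () => {
    const filename = await camera.capture(); // writes ./bots/andy/screenshots/<filename>.jpg
    console.log('captured', filename);
});
```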

View file

@@ -0,0 +1,81 @@
import { Vec3 } from 'vec3';
import { Camera } from "./camera.js";
import fs from 'fs';
export class VisionInterpreter {
constructor(agent, allow_vision) {
this.agent = agent;
this.allow_vision = allow_vision;
this.fp = './bots/'+agent.name+'/screenshots/';
if (allow_vision) {
this.camera = new Camera(agent.bot, this.fp);
}
}
async lookAtPlayer(player_name, direction) {
if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
return "Vision is disabled. Use other methods to describe the environment.";
}
let result = "";
const bot = this.agent.bot;
const player = bot.players[player_name]?.entity;
if (!player) {
return `Could not find player ${player_name}`;
}
let filename;
if (direction === 'with') {
await bot.look(player.yaw, player.pitch);
result = `Looking in the same direction as ${player_name}\n`;
filename = await this.camera.capture();
} else {
await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
result = `Looking at player ${player_name}\n`;
filename = await this.camera.capture();
}
return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
}
async lookAtPosition(x, y, z) {
if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
return "Vision is disabled. Use other methods to describe the environment.";
}
let result = "";
const bot = this.agent.bot;
await bot.lookAt(new Vec3(x, y + 2, z));
result = `Looking at coordinate ${x}, ${y}, ${z}\n`;
let filename = await this.camera.capture();
return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
}
getCenterBlockInfo() {
const bot = this.agent.bot;
const maxDistance = 128; // Maximum distance to check for blocks
const targetBlock = bot.blockAtCursor(maxDistance);
if (targetBlock) {
return `Block at center view: ${targetBlock.name} at (${targetBlock.position.x}, ${targetBlock.position.y}, ${targetBlock.position.z})`;
} else {
return "No block in center view";
}
}
async analyzeImage(filename) {
try {
const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
const messages = this.agent.history.getHistory();
const blockInfo = this.getCenterBlockInfo();
const result = await this.agent.prompter.promptVision(messages, imageBuffer);
return result + `\n${blockInfo}`;
} catch (error) {
console.warn('Error reading image:', error);
return `Error reading image: ${error.message}`;
}
}
}
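Putting the pieces together, a hedged sketch of the call path the new commands exercise (the `agent`, player name, and coordinates are illustrative): the interpreter turns the bot's head, captures a frame through `Camera`, hands the image plus recent chat history to `promptVision`, and appends the block under the crosshair from `getCenterBlockInfo`.

```js
import { VisionInterpreter } from './vision_interpreter.js';

// `agent` is assumed to be a fully initialized Agent with a prompter and bot.
const interpreter = new VisionInterpreter(agent, true);

const fromPlayer = await interpreter.lookAtPlayer('d7297', 'at');
const fromCoords = await interpreter.lookAtPosition(0, 70, 12);
console.log(fromPlayer); // e.g. 'Looking at player d7297\nImage analysis: "..."'
console.log(fromCoords);
```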

View file

@@ -47,16 +47,40 @@ export class Claude {
}
}
catch (err) {
if (err.message.includes("does not support image input")) {
res = "Vision is only supported by certain models.";
} else {
res = "My brain disconnected, try again.";
}
console.log(err);
res = 'My brain disconnected, try again.';
}
return res;
}
async sendVisionRequest(turns, systemMessage, imageBuffer) {
const imageMessages = [...turns];
imageMessages.push({
role: "user",
content: [
{
type: "text",
text: systemMessage
},
{
type: "image",
source: {
type: "base64",
media_type: "image/jpeg",
data: imageBuffer.toString('base64')
}
}
]
});
return this.sendRequest(imageMessages, systemMessage);
}
async embed(text) {
throw new Error('Embeddings are not supported by Claude.');
}
}

View file

@@ -96,6 +96,51 @@ export class Gemini {
return text;
}
async sendVisionRequest(turns, systemMessage, imageBuffer) {
let model;
if (this.url) {
model = this.genAI.getGenerativeModel(
{ model: this.model_name || "gemini-1.5-flash" },
{ baseUrl: this.url },
{ safetySettings: this.safetySettings }
);
} else {
model = this.genAI.getGenerativeModel(
{ model: this.model_name || "gemini-1.5-flash" },
{ safetySettings: this.safetySettings }
);
}
const imagePart = {
inlineData: {
data: imageBuffer.toString('base64'),
mimeType: 'image/jpeg'
}
};
const stop_seq = '***';
const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
let res = null;
try {
console.log('Awaiting Google API vision response...');
const result = await model.generateContent([prompt, imagePart]);
const response = await result.response;
const text = response.text();
console.log('Received.');
if (!text.includes(stop_seq)) return text;
const idx = text.indexOf(stop_seq);
res = text.slice(0, idx);
} catch (err) {
console.log(err);
if (err.message.includes("Image input modality is not enabled for models/")) {
res = "Vision is only supported by certain models.";
} else {
res = "An unexpected error occurred, please try again.";
}
}
return res;
}
async embed(text) {
let model;
if (this.url) {

View file

@@ -48,6 +48,9 @@ export class GPT {
if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
console.log('Context length exceeded, trying again with shorter context.');
return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
} else if (err.message.includes('image_url')) {
console.log(err);
res = 'Vision is only supported by certain models.';
} else {
console.log(err);
res = 'My brain disconnected, try again.';
@@ -56,6 +59,24 @@ export class GPT {
return res;
}
async sendVisionRequest(messages, systemMessage, imageBuffer) {
const imageMessages = [...messages];
imageMessages.push({
role: "user",
content: [
{ type: "text", text: systemMessage },
{
type: "image_url",
image_url: {
url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
}
}
]
});
return this.sendRequest(imageMessages, systemMessage);
}
async embed(text) {
if (text.length > 8191)
text = text.slice(0, 8191);
@@ -66,6 +87,7 @@ export class GPT {
});
return embedding.data[0].embedding;
}
}
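The OpenAI-style adapters (GPT here, Grok and Groq below) all follow the same shape: the screenshot is appended as one extra `user` turn whose content mixes a text part with a base64 `image_url` data URI, and the request is then routed back through the ordinary `sendRequest`. A hedged usage sketch against this adapter — the wrapper instance and file path are assumptions:

```js
import fs from 'fs';

// `gpt` is assumed to be a constructed GPT wrapper from the models layer.
const imageBuffer = fs.readFileSync('./bots/andy/screenshots/screenshot_example.jpg');
const analysis = await gpt.sendVisionRequest(
    [],                                      // prior turns; none for this sketch
    'Describe the terrain in one sentence.',
    imageBuffer
);
console.log(analysis);
```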

View file

@@ -43,6 +43,9 @@ export class Grok {
if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
console.log('Context length exceeded, trying again with shorter context.');
return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
} else if (err.message.includes('The model expects a single `text` element per message.')) {
console.log(err);
res = 'Vision is only supported by certain models.';
} else {
console.log(err);
res = 'My brain disconnected, try again.';
@@ -51,6 +54,24 @@ export class Grok {
// sometimes outputs special token <|separator|>, just replace it
return res.replace(/<\|separator\|>/g, '*no response*');
}
async sendVisionRequest(messages, systemMessage, imageBuffer) {
const imageMessages = [...messages];
imageMessages.push({
role: "user",
content: [
{ type: "text", text: systemMessage },
{
type: "image_url",
image_url: {
url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
}
}
]
});
return this.sendRequest(imageMessages, systemMessage);
}
async embed(text) {
throw new Error('Embeddings are not supported by Grok.');

View file

@@ -27,80 +27,69 @@ export class GroqCloudAPI {
}
async sendRequest(turns, systemMessage, stop_seq = null) {
// Variables for DeepSeek-R1 models
const maxAttempts = 5;
let attempt = 0;
let finalRes = null;
let res = null;
// Construct messages array
let messages = [{"role": "system", "content": systemMessage}].concat(turns);
while (attempt < maxAttempts) {
attempt++;
// These variables look odd, but they're for the future.
let raw_res = null;
let tool_calls = null;
try {
console.log("Awaiting Groq response...");
// Handle deprecated max_tokens parameter
if (this.params.max_tokens) {
console.warn("GROQCLOUD WARNING: A profile is using `max_tokens`. This is deprecated. Please move to `max_completion_tokens`.");
this.params.max_completion_tokens = this.params.max_tokens;
delete this.params.max_tokens;
}
if (!this.params.max_completion_tokens) {
this.params.max_completion_tokens = 8000; // Set it lower.
}
let completion = await this.groq.chat.completions.create({
"messages": messages,
"model": this.model_name || "llama-3.3-70b-versatile",
"stream": false,
"stop": stop_seq,
...(this.params || {})
});
raw_res = completion.choices[0].message;
res = raw_res.content;
} catch (err) {
console.log(err);
res = "My brain just kinda stopped working. Try again.";
}
// Check for <think> tag issues
const hasOpenTag = res.includes("<think>");
const hasCloseTag = res.includes("</think>");
// If a partial <think> block is detected, log a warning and retry
if (hasOpenTag && !hasCloseTag) {
console.warn("Partial <think> block detected. Re-generating Groq request...");
continue; // This will skip the rest of the loop and try again
}
// If only the closing tag is present, prepend an opening tag
if (hasCloseTag && !hasOpenTag) {
res = '<think>' + res;
}
// Remove the complete <think> block (and any content inside) from the response
res = res.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
finalRes = res;
break; // Exit the loop once a valid response is obtained
}
if (finalRes == null) {
console.warn("Could not obtain a valid <think> block or normal response after max attempts.");
finalRes = "I thought too hard, sorry, try again.";
}
finalRes = finalRes.replace(/<\|separator\|>/g, '*no response*');
return finalRes;
}
async sendRequest(turns, systemMessage, stop_seq = null) {
// Construct messages array
let messages = [{"role": "system", "content": systemMessage}].concat(turns);
let res = null;
try {
console.log("Awaiting Groq response...");
// Handle deprecated max_tokens parameter
if (this.params.max_tokens) {
console.warn("GROQCLOUD WARNING: A profile is using `max_tokens`. This is deprecated. Please move to `max_completion_tokens`.");
this.params.max_completion_tokens = this.params.max_tokens;
delete this.params.max_tokens;
}
if (!this.params.max_completion_tokens) {
this.params.max_completion_tokens = 4000;
}
let completion = await this.groq.chat.completions.create({
"messages": messages,
"model": this.model_name || "llama-3.3-70b-versatile",
"stream": false,
"stop": stop_seq,
...(this.params || {})
});
res = completion.choices[0].message;
res = res.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
}
catch(err) {
if (err.message.includes("content must be a string")) {
res = "Vision is only supported by certain models.";
} else {
console.log(this.model_name);
res = "My brain disconnected, try again.";
}
console.log(err);
}
return res;
}
async sendVisionRequest(messages, systemMessage, imageBuffer) {
const imageMessages = messages.filter(message => message.role !== 'system');
imageMessages.push({
role: "user",
content: [
{ type: "text", text: systemMessage },
{
type: "image_url",
image_url: {
url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
}
}
]
});
return this.sendRequest(imageMessages);
}
async embed(_) {
throw new Error('Embeddings are not supported by Groq.');
}

View file

@@ -47,6 +47,7 @@ export class Mistral {
];
messages.push(...strictFormat(turns));
console.log('Awaiting mistral api response...')
const response = await this.#client.chat.complete({
model,
messages,
@@ -55,14 +56,33 @@
result = response.choices[0].message.content;
} catch (err) {
console.log(err)
result = "My brain disconnected, try again.";
if (err.message.includes("A request containing images has been given to a model which does not have the 'vision' capability.")) {
result = "Vision is only supported by certain models.";
} else {
result = "My brain disconnected, try again.";
}
console.log(err);
}
return result;
}
async sendVisionRequest(messages, systemMessage, imageBuffer) {
const imageMessages = [...messages];
imageMessages.push({
role: "user",
content: [
{ type: "text", text: systemMessage },
{
type: "image_url",
imageUrl: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
}
]
});
return this.sendRequest(imageMessages, systemMessage);
}
async embed(text) {
const embedding = await this.#client.embeddings.create({
model: "mistral-embed",

View file

@@ -66,6 +66,14 @@ export class Prompter {
this.code_model = this.chat_model;
}
if (this.profile.vision_model) {
let vision_model_profile = this._selectAPI(this.profile.vision_model);
this.vision_model = this._createModel(vision_model_profile);
}
else {
this.vision_model = this.chat_model;
}
let embedding = this.profile.embedding;
if (embedding === undefined) {
if (chat_model_profile.api !== 'ollama')
@@ -359,6 +367,13 @@ export class Prompter {
return res.trim().toLowerCase() === 'respond';
}
async promptVision(messages, imageBuffer) {
await this.checkCooldown();
let prompt = this.profile.image_analysis;
prompt = await this.replaceStrings(prompt, messages, null, null, null);
return await this.vision_model.sendVisionRequest(messages, prompt, imageBuffer);
}
async promptGoalSetting(messages, last_goals) {
let system_message = this.profile.goal_setting;
system_message = await this.replaceStrings(system_message, messages);
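`promptVision` reads its system prompt from the profile's `image_analysis` entry (added in the `_default.json` hunk above), fills placeholders such as `$NAME` and `$STATS` through `replaceStrings`, and delegates to the configured vision model. A hypothetical direct call, assuming an initialized `prompter` and a JPEG `imageBuffer`:

```js
const analysis = await prompter.promptVision(agent.history.getHistory(), imageBuffer);
console.log(analysis); // concise scene description from the vision model
```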

View file

@@ -46,7 +46,9 @@ export function strictFormat(turns) {
let messages = [];
let filler = {role: 'user', content: '_'};
for (let msg of turns) {
msg.content = msg.content.trim();
if (typeof msg.content === 'string') {
msg.content = msg.content.trim();
}
if (msg.role === 'system') {
msg.role = 'user';
msg.content = 'SYSTEM: ' + msg.content;
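This guard exists because vision turns now carry array content (a text part plus an image part), which has no `.trim()`. A small sketch of the two message shapes `strictFormat` must now accept; the contents are illustrative:

```js
// Plain string content: trimmed as before.
const plainTurn = { role: 'user', content: '  hello there  ' };

// Array content from a vision request: left untouched by the trim step.
const visionTurn = {
    role: 'user',
    content: [
        { type: 'text', text: 'Describe the scene.' },
        { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,...' } }
    ]
};
```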