Merge pull request #3 from Jules' work

Jules wip 2192516976139170352
Sweaterdog 2025-06-07 02:33:05 -07:00 committed by GitHub
commit 0db80cfc56
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 651 additions and 254 deletions

View file

@ -35,6 +35,7 @@ const settings = {
"allow_insecure_coding": false, // allows newAction command and model can write/run code on your computer. enable at own risk
"allow_vision": false, // allows vision model to interpret screenshots as inputs
"vision_mode": "prompted", // "off", "prompted", or "always"
"blocked_actions" : ["!checkBlueprint", "!checkBlueprintLevel", "!getBlueprint", "!getBlueprintLevel"] , // commands to disable and remove from docs. Ex: ["!setMode"]
"code_timeout_mins": -1, // minutes code is allowed to run. -1 for no timeout
"relevant_docs_count": 5, // number of relevant code function docs to select for prompting. -1 for all

View file

@ -20,6 +20,7 @@ import { say } from './speak.js';
export class Agent {
async start(profile_fp, load_mem=false, init_message=null, count_id=0, task_path=null, task_id=null) {
this.last_sender = null;
this.latestScreenshotPath = null;
this.count_id = count_id;
if (!profile_fp) {
throw new Error('No profile filepath provided');
@ -116,7 +117,7 @@ export class Agent {
this.checkAllPlayersPresent();
console.log('Initializing vision interpreter...');
this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision);
this.vision_interpreter = new VisionInterpreter(this, settings.vision_mode);
} catch (error) {
console.error('Error in spawn event:', error);
@ -172,7 +173,8 @@ export class Agent {
if (save_data?.self_prompt) {
if (init_message) {
this.history.add('system', init_message);
// Assuming init_message for self_prompt loading doesn't have an image
await this.history.add('system', init_message, null);
}
await this.self_prompter.handleLoad(save_data.self_prompt, save_data.self_prompting_state);
}
@ -246,6 +248,15 @@ export class Agent {
const from_other_bot = convoManager.isOtherAgent(source);
if (!self_prompt && !from_other_bot) { // from user, check for forced commands
if (settings.vision_mode === 'always' && this.vision_interpreter && this.vision_interpreter.camera) {
try {
const screenshotFilename = await this.vision_interpreter.camera.capture();
this.latestScreenshotPath = screenshotFilename;
console.log(`[${this.name}] Captured screenshot in 'always' vision mode: ${screenshotFilename}`);
} catch (error) {
console.error(`[${this.name}] Error capturing screenshot in 'always' vision mode:`, error);
}
}
const user_command_name = containsCommand(message);
if (user_command_name) {
if (!commandExists(user_command_name)) {
@ -256,7 +267,16 @@ export class Agent {
if (user_command_name === '!newAction') {
// all user-initiated commands are ignored by the bot except for this one
// add the preceding message to the history to give context for newAction
this.history.add(source, message);
// This is the user's message that contains the !newAction command.
// If a screenshot was taken in 'always' mode, it should be associated here.
let imagePathForNewActionCmd = null;
if (settings.vision_mode === 'always' && this.latestScreenshotPath && !self_prompt && !from_other_bot) {
imagePathForNewActionCmd = this.latestScreenshotPath;
}
await this.history.add(source, message, imagePathForNewActionCmd);
if (imagePathForNewActionCmd) {
this.latestScreenshotPath = null; // Consume path
}
}
let execute_res = await executeCommand(this, message);
if (execute_res)
@ -281,11 +301,29 @@ export class Agent {
behavior_log = '...' + behavior_log.substring(behavior_log.length - MAX_LOG);
}
behavior_log = 'Recent behaviors log: \n' + behavior_log;
await this.history.add('system', behavior_log);
await this.history.add('system', behavior_log, null); // Behavior log unlikely to have an image
}
// Handle other user messages
await this.history.add(source, message);
// Handle other user messages (or initial system messages)
let imagePathForInitialMessage = null;
if (!self_prompt && !from_other_bot) {
// If it's a user message and a screenshot was auto-captured in 'always' mode
if (settings.vision_mode === 'always' && this.latestScreenshotPath) {
imagePathForInitialMessage = this.latestScreenshotPath;
}
} else if (source === 'system' && this.latestScreenshotPath && message.startsWith("You died at position")) {
// Illustrative placeholder: death messages do not currently set latestScreenshotPath,
// so this branch is a no-op. System messages produced by a vision command are
// handled explicitly with the command result below, which is the preferred path.
// imagePathForInitialMessage = this.latestScreenshotPath;
}
await this.history.add(source, message, imagePathForInitialMessage);
if (imagePathForInitialMessage) {
this.latestScreenshotPath = null; // Consume the path if used
}
this.history.save();
if (!self_prompt && this.self_prompter.isActive()) // message is from user during self-prompting
@ -306,10 +344,12 @@ export class Agent {
if (command_name) { // contains query or command
res = truncCommandMessage(res); // everything after the command is ignored
this.history.add(this.name, res);
// Agent's own message stating the command it will execute
await this.history.add(this.name, res, null);
if (!commandExists(command_name)) {
this.history.add('system', `Command ${command_name} does not exist.`);
// Agent hallucinated a command
await this.history.add('system', `Command ${command_name} does not exist.`, null);
console.warn('Agent hallucinated command:', command_name)
continue;
}
@ -333,13 +373,24 @@ export class Agent {
console.log('Agent executed:', command_name, 'and got:', execute_res);
used_command = true;
if (execute_res)
this.history.add('system', execute_res);
else
if (execute_res) {
let imagePathForCommandResult = null;
// Vision commands (!lookAtPlayer, !lookAtPosition) set latestScreenshotPath in VisionInterpreter.
// This is relevant when the mode is 'prompted' (analysis done, path stored by the interpreter) or 'always' (screenshot taken, path stored by the interpreter).
if (command_name && (command_name === '!lookAtPlayer' || command_name === '!lookAtPosition') && this.latestScreenshotPath) {
imagePathForCommandResult = this.latestScreenshotPath;
}
await this.history.add('system', execute_res, imagePathForCommandResult);
if (imagePathForCommandResult) {
this.latestScreenshotPath = null; // Consume the path
}
}
else { // command execution returned nothing (or failed), so break out of the response loop
break;
}
}
else { // conversation response
this.history.add(this.name, res);
else { // conversation response (no command)
await this.history.add(this.name, res, null); // Agent's text response, no image typically
this.routeResponse(source, res);
break;
}
@ -488,7 +539,8 @@ export class Agent {
async cleanKill(msg='Killing agent process...', code=1) {
this.history.add('system', msg);
// Assuming cleanKill messages don't have images
await this.history.add('system', msg, null);
this.bot.chat(code > 1 ? 'Restarting.': 'Exiting.');
this.history.save();
process.exit(code);
@ -497,7 +549,8 @@ export class Agent {
if (this.task.data) {
let res = this.task.isDone();
if (res) {
await this.history.add('system', `Task ended with score : ${res.score}`);
// Assuming task end messages don't have images
await this.history.add('system', `Task ended with score : ${res.score}`, null);
await this.history.save();
// await new Promise(resolve => setTimeout(resolve, 3000)); // Wait 3 second for save to complete
console.log('Task finished:', res.message);
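
The agent changes above repeat one pattern: capture at most one screenshot, attach its path to exactly one history entry, then clear it so it cannot leak into later turns. A standalone sketch of that pattern, using hypothetical `camera` and `history` stand-ins rather than the project's actual classes:

// Sketch of the capture -> attach -> consume pattern used throughout message handling.
// `camera.capture()` resolves to a filename and `history.add()` takes an optional
// image path; both are assumed stand-ins for the classes shown in this commit.
async function recordUserMessage(camera, history, source, message, visionMode) {
    let screenshotPath = null;
    if (visionMode === 'always') {
        try {
            screenshotPath = await camera.capture();          // e.g. "screenshot_123.jpg"
        } catch (err) {
            console.error('Screenshot capture failed:', err); // message is still recorded without an image
        }
    }
    await history.add(source, message, screenshotPath);       // attach the path to exactly one turn
    screenshotPath = null;                                    // consume it so later turns stay text-only
}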

View file

@ -428,6 +428,13 @@ export const actionsList = [
}
},
perform: async function(agent, player_name, direction) {
if (agent.vision_interpreter && agent.vision_interpreter.vision_mode === 'off') {
return "Vision commands are disabled as vision mode is 'off'.";
}
// Also bail out if the camera is unavailable even though vision is not 'off'
if (agent.vision_interpreter && !agent.vision_interpreter.camera && agent.vision_interpreter.vision_mode !== 'off') {
return "Camera is not available, cannot perform look command.";
}
if (direction !== 'at' && direction !== 'with') {
return "Invalid direction. Use 'at' or 'with'.";
}
@ -448,6 +455,13 @@ export const actionsList = [
'z': { type: 'int', description: 'z coordinate' }
},
perform: async function(agent, x, y, z) {
if (agent.vision_interpreter && agent.vision_interpreter.vision_mode === 'off') {
return "Vision commands are disabled as vision mode is 'off'.";
}
// Also bail out if the camera is unavailable even though vision is not 'off'
if (agent.vision_interpreter && !agent.vision_interpreter.camera && agent.vision_interpreter.vision_mode !== 'off') {
return "Camera is not available, cannot perform look command.";
}
let result = "";
const actionFn = async () => {
result = await agent.vision_interpreter.lookAtPosition(x, y, z);

View file

@ -58,7 +58,7 @@ export class History {
}
}
async add(name, content) {
async add(name, content, imagePath = null) {
let role = 'assistant';
if (name === 'system') {
role = 'system';
@ -67,7 +67,7 @@ export class History {
role = 'user';
content = `${name}: ${content}`;
}
this.turns.push({role, content});
this.turns.push({role, content, imagePath});
if (this.turns.length >= this.max_messages) {
let chunk = this.turns.splice(0, this.summary_chunk_size);
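
With the optional third argument, each stored turn now carries an `imagePath` field (null for text-only turns). Illustrative calls and the resulting entries, assuming the filename format produced by `camera.js`:

// Illustrative: what history.add() stores after this change.
await history.add('steve', 'What do you see?', 'screenshot_17.jpg');
await history.add('system', 'Command !stats executed.', null);
// history.turns now ends with entries shaped like:
//   { role: 'user',   content: 'steve: What do you see?',  imagePath: 'screenshot_17.jpg' }
//   { role: 'system', content: 'Command !stats executed.', imagePath: null }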

View file

@ -60,8 +60,8 @@ export class Camera extends EventEmitter {
const buf = await getBufferFromStream(imageStream);
await this._ensureScreenshotDirectory();
await fs.writeFile(`${this.fp}/${filename}.jpg`, buf);
console.log('saved', filename);
return filename;
console.log('saved', filename + '.jpg');
return filename + '.jpg';
}
async _ensureScreenshotDirectory() {

View file

@ -1,21 +1,29 @@
import { Vec3 } from 'vec3';
import { Camera } from "./camera.js";
import fs from 'fs';
import path from 'path';
export class VisionInterpreter {
constructor(agent, allow_vision) {
constructor(agent, vision_mode) {
this.agent = agent;
this.allow_vision = allow_vision;
this.vision_mode = vision_mode;
this.fp = './bots/'+agent.name+'/screenshots/';
if (allow_vision) {
if (this.vision_mode !== 'off') {
this.camera = new Camera(agent.bot, this.fp);
}
}
async lookAtPlayer(player_name, direction) {
if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
if (this.vision_mode === 'off') {
return "Vision is disabled. Use other methods to describe the environment.";
}
if (!this.camera) {
return "Camera is not initialized. Vision may be set to 'off'.";
}
if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
return "Vision requests are not enabled for the current model. Cannot analyze image.";
}
let result = "";
const bot = this.agent.bot;
const player = bot.players[player_name]?.entity;
@ -26,30 +34,51 @@ export class VisionInterpreter {
let filename;
if (direction === 'with') {
await bot.look(player.yaw, player.pitch);
result = `Looking in the same direction as ${player_name}\n`;
result = `Looking in the same direction as ${player_name}.\n`;
filename = await this.camera.capture();
this.agent.latestScreenshotPath = filename;
} else {
await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
result = `Looking at player ${player_name}\n`;
result = `Looking at player ${player_name}.\n`;
filename = await this.camera.capture();
this.agent.latestScreenshotPath = filename;
}
return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
if (this.vision_mode === 'prompted') {
return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
} else if (this.vision_mode === 'always') {
return result + "Screenshot taken and stored.";
}
// Should not be reached if vision_mode is one of the expected values
return "Error: Unknown vision mode.";
}
async lookAtPosition(x, y, z) {
if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
if (this.vision_mode === 'off') {
return "Vision is disabled. Use other methods to describe the environment.";
}
if (!this.camera) {
return "Camera is not initialized. Vision may be set to 'off'.";
}
if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
return "Vision requests are not enabled for the current model. Cannot analyze image.";
}
let result = "";
const bot = this.agent.bot;
await bot.lookAt(new Vec3(x, y + 2, z));
result = `Looking at coordinate ${x}, ${y}, ${z}\n`;
await bot.lookAt(new Vec3(x, y + 2, z)); // lookAt requires y to be eye level, so +2 from feet
result = `Looking at coordinate ${x}, ${y}, ${z}.\n`;
let filename = await this.camera.capture();
this.agent.latestScreenshotPath = filename;
return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
if (this.vision_mode === 'prompted') {
return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
} else if (this.vision_mode === 'always') {
return result + "Screenshot taken and stored.";
}
// Should not be reached if vision_mode is one of the expected values
return "Error: Unknown vision mode.";
}
getCenterBlockInfo() {
@ -66,7 +95,9 @@ export class VisionInterpreter {
async analyzeImage(filename) {
try {
const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
// filename already includes .jpg from camera.js
const imageFullPath = path.join(this.fp, filename);
const imageBuffer = fs.readFileSync(imageFullPath);
const messages = this.agent.history.getHistory();
const blockInfo = this.getCenterBlockInfo();
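
A brief usage sketch of the mode-dependent behaviour above (return strings paraphrased; `agent` is assumed to be an initialized Agent):

// Illustrative: what a caller gets back depending on vision_mode.
const vi = new VisionInterpreter(agent, 'always');     // 'off' | 'prompted' | 'always'
const res = await vi.lookAtPlayer('Steve', 'at');
// 'prompted' -> "Looking at player Steve.\nImage analysis: \"...\""        (model analyzes the image)
// 'always'   -> "Looking at player Steve.\nScreenshot taken and stored."   (path saved on agent.latestScreenshotPath)
// 'off'      -> no camera is created and the method returns the "Vision is disabled..." message.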

View file

@ -14,13 +14,61 @@ export class Claude {
config.apiKey = getKey('ANTHROPIC_API_KEY');
this.anthropic = new Anthropic(config);
this.supportsRawImageInput = true;
}
async sendRequest(turns, systemMessage) {
const messages = strictFormat(turns);
async sendRequest(turns, systemMessage, imageData = null) {
const messages = strictFormat(turns); // Ensure messages are in role/content format
let res = null;
if (imageData) {
const visionModels = ["claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"];
if (!visionModels.some(vm => this.model_name.includes(vm))) {
console.warn(`[Claude] Warning: imageData provided for model ${this.model_name}, which is not explicitly a Claude 3 vision model. The image may be ignored or cause an error.`);
}
let lastUserMessageIndex = -1;
for (let i = messages.length - 1; i >= 0; i--) {
if (messages[i].role === 'user') {
lastUserMessageIndex = i;
break;
}
}
if (lastUserMessageIndex !== -1) {
const userMessage = messages[lastUserMessageIndex];
const imagePart = {
type: "image",
source: {
type: "base64",
media_type: "image/jpeg", // Assuming JPEG
data: imageData.toString('base64')
}
};
if (typeof userMessage.content === 'string') {
userMessage.content = [{ type: "text", text: userMessage.content }, imagePart];
} else if (Array.isArray(userMessage.content)) {
// If content is already an array, add the image part.
// This handles cases where a user message might already have multiple parts (e.g. multiple text parts, though less common for this bot).
userMessage.content.push(imagePart);
} else {
// Fallback or error if content is an unexpected type
console.warn('[Claude] Last user message content is not a string or array. Cannot attach image.');
userMessage.content = [imagePart]; // Or create a new message with just the image if appropriate
}
} else {
console.warn('[Claude] imageData provided, but no user message found to attach it to. Image not sent.');
// Optionally, could create a new user message with the image if that's desired behavior.
// messages.push({ role: 'user', content: [imagePart] });
}
}
try {
console.log('Awaiting anthropic api response...')
console.log('Awaiting anthropic api response...');
// console.log('Formatted Messages for API:', JSON.stringify(messages, null, 2));
// console.log('System prompt for API:', systemMessage);
if (!this.params.max_tokens) {
if (this.params.thinking?.budget_tokens) {
this.params.max_tokens = this.params.thinking.budget_tokens + 1000;
@ -30,9 +78,9 @@ export class Claude {
}
}
const resp = await this.anthropic.messages.create({
model: this.model_name || "claude-3-sonnet-20240229",
model: this.model_name || "claude-3-sonnet-20240229", // Default to a vision-capable model if none specified
system: systemMessage,
messages: messages,
messages: messages, // messages array is now potentially modified with image data
...(this.params || {})
});
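
For reference, the loop above leaves the last user message in Anthropic's content-block format; a hand-written equivalent (base64 payload shortened, text illustrative) looks like:

// Resulting shape of the last user message once the image part is attached.
const lastUserMessage = {
    role: 'user',
    content: [
        { type: 'text', text: 'steve: What do you see?' },
        {
            type: 'image',
            source: {
                type: 'base64',
                media_type: 'image/jpeg',
                data: '/9j/4AAQSkZJRg...'   // imageData.toString('base64')
            }
        }
    ]
};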

View file

@ -13,13 +13,65 @@ export class DeepSeek {
config.apiKey = getKey('DEEPSEEK_API_KEY');
this.openai = new OpenAIApi(config);
this.supportsRawImageInput = true; // Assuming DeepSeek models used can support this OpenAI-like format
}
async sendRequest(turns, systemMessage, stop_seq='***') {
async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
messages = strictFormat(messages);
if (imageData) {
console.warn(`[DeepSeek] imageData provided. Ensure the configured DeepSeek model ('${this.model_name || "deepseek-chat"}') is vision-capable.`);
let lastUserMessageIndex = -1;
for (let i = messages.length - 1; i >= 0; i--) {
if (messages[i].role === 'user') {
lastUserMessageIndex = i;
break;
}
}
if (lastUserMessageIndex !== -1) {
const userMessage = messages[lastUserMessageIndex];
const originalContent = userMessage.content; // Should be a string
if (typeof originalContent === 'string') {
userMessage.content = [
{ type: "text", text: originalContent },
{
type: "image_url",
image_url: {
url: `data:image/jpeg;base64,${imageData.toString('base64')}`
}
}
];
} else {
// If content is already an array (e.g. from a previous modification or different source)
// We'd need a more robust way to handle this, but for now, assume it's a string
// or log an error/warning.
console.warn('[DeepSeek] Last user message content was not a simple string. Attempting to add image, but structure might be unexpected.');
if(Array.isArray(originalContent)) {
originalContent.push({
type: "image_url",
image_url: { url: `data:image/jpeg;base64,${imageData.toString('base64')}` }
});
userMessage.content = originalContent;
} else { // Fallback if it's some other type, just overwrite with new structure
userMessage.content = [
{ type: "text", text: String(originalContent) }, // Attempt to stringify
{
type: "image_url",
image_url: { url: `data:image/jpeg;base64,${imageData.toString('base64')}` }
}
];
}
}
} else {
console.warn('[DeepSeek] imageData provided, but no user message found to attach it to. Image not sent.');
// Or: messages.push({ role: 'user', content: [ { type: "image_url", image_url: { url: ... } } ] });
}
}
const pack = {
model: this.model_name || "deepseek-chat",
messages,
@ -29,12 +81,12 @@ export class DeepSeek {
let res = null;
try {
console.log('Awaiting deepseek api response...')
// console.log('Messages:', messages);
console.log('Awaiting deepseek api response...');
// console.log('Formatted Messages for API:', JSON.stringify(messages, null, 2));
let completion = await this.openai.chat.completions.create(pack);
if (completion.choices[0].finish_reason == 'length')
throw new Error('Context length exceeded');
console.log('Received.')
console.log('Received.');
res = completion.choices[0].message.content;
}
catch (err) {

View file

@ -31,9 +31,10 @@ export class Gemini {
];
this.genAI = new GoogleGenerativeAI(getKey('GEMINI_API_KEY'));
this.supportsRawImageInput = true;
}
async sendRequest(turns, systemMessage) {
async sendRequest(turns, systemMessage, imageData = null) {
let model;
const modelConfig = {
model: this.model_name || "gemini-1.5-flash",
@ -64,6 +65,24 @@ export class Gemini {
});
}
if (imageData && contents.length > 0) {
const lastContent = contents[contents.length - 1];
if (lastContent.role === 'user') { // Ensure the image is added to a user turn
lastContent.parts.push({
inline_data: {
mime_type: 'image/jpeg',
data: imageData.toString('base64')
}
});
} else {
// This should not happen if imageData always accompanies a user message.
// If it does, warn and send the request without the image (a new user turn
// could be appended instead, but that is not done here).
console.warn('[Gemini] imageData provided, but the last content entry was not from a user. Image not sent.');
}
}
const result = await model.generateContent({
contents,
generationConfig: {
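
Gemini takes the image as an `inline_data` part on the last user `contents` entry rather than a URL; an illustrative equivalent of what the push above produces:

// Illustrative: last contents entry after the inline_data part is pushed.
const lastContent = {
    role: 'user',
    parts: [
        { text: 'steve: What do you see?' },
        {
            inline_data: {
                mime_type: 'image/jpeg',
                data: '/9j/4AAQSkZJRg...'   // imageData.toString('base64')
            }
        }
    ]
};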

View file

@ -12,9 +12,15 @@ export class GLHF {
apiKey,
baseURL: url || "https://glhf.chat/api/openai/v1"
});
// Direct image data in sendRequest is not supported by this wrapper.
// Specific vision models/methods should be used if available through the service.
this.supportsRawImageInput = false;
}
async sendRequest(turns, systemMessage, stop_seq = '***') {
async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
if (imageData) {
console.warn(`[GLHF] Warning: imageData provided to sendRequest, but this method in glhf.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
}
// Construct the message array for the API request.
let messages = [{ role: 'system', content: systemMessage }].concat(turns);
const pack = {
@ -50,7 +56,8 @@ export class GLHF {
} catch (err) {
if ((err.message === 'Context length exceeded' || err.code === 'context_length_exceeded') && turns.length > 1) {
console.log('Context length exceeded, trying again with shorter context.');
return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
// Pass imageData along in recursive call, though it will be ignored again
return await this.sendRequest(turns.slice(1), systemMessage, imageData, stop_seq);
} else {
console.error(err);
finalRes = 'My brain disconnected, try again.';

View file

@ -17,11 +17,45 @@ export class GPT {
config.apiKey = getKey('OPENAI_API_KEY');
this.openai = new OpenAIApi(config);
this.supportsRawImageInput = true;
}
async sendRequest(turns, systemMessage, stop_seq='***') {
async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
messages = strictFormat(messages);
if (imageData) {
const visionModels = ["gpt-4-vision-preview", "gpt-4o", "gpt-4-turbo"];
if (!visionModels.some(vm => this.model_name.includes(vm))) {
console.warn(`[GPT] Warning: imageData provided for model ${this.model_name}, which is not explicitly a vision model. The image may be ignored or cause an error.`);
}
let lastUserMessageIndex = -1;
for (let i = messages.length - 1; i >= 0; i--) {
if (messages[i].role === 'user') {
lastUserMessageIndex = i;
break;
}
}
if (lastUserMessageIndex !== -1) {
const originalContent = messages[lastUserMessageIndex].content;
messages[lastUserMessageIndex].content = [
{ type: "text", text: originalContent },
{
type: "image_url",
image_url: {
url: `data:image/jpeg;base64,${imageData.toString('base64')}`
}
}
];
} else {
// No user message to attach the image to; log a warning and send without it.
// The prompter should ensure a user message exists whenever imagePath is set.
console.warn('[GPT] imageData provided, but no user message found to attach it to. Image not sent.');
}
}
const pack = {
model: this.model_name || "gpt-3.5-turbo",
messages,
@ -35,12 +69,12 @@ export class GPT {
let res = null;
try {
console.log('Awaiting openai api response from model', this.model_name)
// console.log('Messages:', messages);
console.log('Awaiting openai api response from model', this.model_name);
// console.log('Formatted Messages for API:', JSON.stringify(messages, null, 2));
let completion = await this.openai.chat.completions.create(pack);
if (completion.choices[0].finish_reason == 'length')
throw new Error('Context length exceeded');
console.log('Received.')
console.log('Received.');
res = completion.choices[0].message.content;
}
catch (err) {
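
The GPT wrapper (and the DeepSeek wrapper above) uses OpenAI's `image_url` content-part format instead; the last user message ends up shaped like this (data URL truncated for illustration):

// Illustrative: OpenAI-style multimodal user message as built by gpt.js and deepseek.js.
const lastUserMessage = {
    role: 'user',
    content: [
        { type: 'text', text: 'steve: What do you see?' },
        {
            type: 'image_url',
            image_url: { url: 'data:image/jpeg;base64,/9j/4AAQSkZJRg...' }
        }
    ]
};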

View file

@ -17,9 +17,15 @@ export class Grok {
config.apiKey = getKey('XAI_API_KEY');
this.openai = new OpenAIApi(config);
// Direct image data in sendRequest is not supported by this wrapper for standard chat.
// Grok may have specific vision capabilities, but this method assumes text-only.
this.supportsRawImageInput = false;
}
async sendRequest(turns, systemMessage, stop_seq='***') {
async sendRequest(turns, systemMessage, imageData = null, stop_seq='***') {
if (imageData) {
console.warn(`[Grok] Warning: imageData provided to sendRequest, but this method in grok.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
}
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
const pack = {
@ -42,7 +48,7 @@ export class Grok {
catch (err) {
if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
console.log('Context length exceeded, trying again with shorter context.');
return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
return await this.sendRequest(turns.slice(1), systemMessage, imageData, stop_seq);
} else if (err.message.includes('The model expects a single `text` element per message.')) {
console.log(err);
res = 'Vision is only supported by certain models.';

View file

@ -23,11 +23,16 @@ export class GroqCloudAPI {
console.warn("Groq Cloud has no implementation for custom URLs. Ignoring provided URL.");
this.groq = new Groq({ apiKey: getKey('GROQCLOUD_API_KEY') });
// Direct image data in sendRequest is not supported by this wrapper.
// Groq may offer specific vision models/APIs, but this standard chat method assumes text.
this.supportsRawImageInput = false;
}
async sendRequest(turns, systemMessage, stop_seq = null) {
async sendRequest(turns, systemMessage, imageData = null, stop_seq = null) {
if (imageData) {
console.warn(`[Groq] Warning: imageData provided to sendRequest, but this method in groq.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
}
// Construct messages array
let messages = [{"role": "system", "content": systemMessage}].concat(turns);
@ -86,7 +91,8 @@ export class GroqCloudAPI {
]
});
return this.sendRequest(imageMessages);
// sendVisionRequest formats its own message array; sendRequest here should not process new imageData.
return this.sendRequest(imageMessages, systemMessage, null, stop_seq);
}
async embed(_) {

View file

@ -14,9 +14,15 @@ export class HuggingFace {
}
this.huggingface = new HfInference(getKey('HUGGINGFACE_API_KEY'));
// Direct image data in sendRequest is not supported by this wrapper.
// HuggingFace Inference API has other methods for vision tasks.
this.supportsRawImageInput = false;
}
async sendRequest(turns, systemMessage) {
async sendRequest(turns, systemMessage, imageData = null) {
if (imageData) {
console.warn(`[HuggingFace] Warning: imageData provided to sendRequest, but this method in huggingface.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
}
const stop_seq = '***';
// Build a single prompt from the conversation turns
const prompt = toSinglePrompt(turns, null, stop_seq);

View file

@ -5,26 +5,20 @@ export class Hyperbolic {
this.modelName = modelName || "deepseek-ai/DeepSeek-V3";
this.apiUrl = apiUrl || "https://api.hyperbolic.xyz/v1/chat/completions";
// Retrieve the Hyperbolic API key from keys.js
this.apiKey = getKey('HYPERBOLIC_API_KEY');
if (!this.apiKey) {
throw new Error('HYPERBOLIC_API_KEY not found. Check your keys.js file.');
}
// Direct image data in sendRequest is not supported by this wrapper.
this.supportsRawImageInput = false;
}
/**
* Sends a chat completion request to the Hyperbolic endpoint.
*
* @param {Array} turns - An array of message objects, e.g. [{role: 'user', content: 'Hi'}].
* @param {string} systemMessage - The system prompt or instruction.
* @param {string} stopSeq - A stopping sequence, default '***'.
* @returns {Promise<string>} - The model's reply.
*/
async sendRequest(turns, systemMessage, stopSeq = '***') {
// Prepare the messages with a system prompt at the beginning
async sendRequest(turns, systemMessage, imageData = null, stopSeq = '***') {
if (imageData) {
console.warn(`[Hyperbolic] Warning: imageData provided to sendRequest, but this method in hyperbolic.js does not support direct image data embedding for model ${this.modelName}. The image will be ignored.`);
}
const messages = [{ role: 'system', content: systemMessage }, ...turns];
// Build the request payload
const payload = {
model: this.modelName,
messages: messages,
@ -32,7 +26,14 @@ export class Hyperbolic {
temperature: 0.7,
top_p: 0.9,
stream: false
// stop: stopSeq, // Hyperbolic may not support stop sequences in the standard payload;
// a stop value is only added below when the caller passes something other than the '***' placeholder.
};
if (stopSeq && stopSeq !== '***') { // Only add stop if it's meaningful and not the default placeholder
payload.stop = stopSeq;
}
const maxAttempts = 5;
let attempt = 0;
@ -41,7 +42,7 @@ export class Hyperbolic {
while (attempt < maxAttempts) {
attempt++;
console.log(`Awaiting Hyperbolic API response... (attempt: ${attempt})`);
console.log('Messages:', messages);
// console.log('Messages:', messages); // Avoid logging full messages in production if sensitive
let completionContent = null;
@ -56,7 +57,12 @@ export class Hyperbolic {
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
// Attempt to read error body for more details
let errorBody = "No additional error details.";
try {
errorBody = await response.text();
} catch (e) { /* ignore if error body can't be read */ }
throw new Error(`HTTP error! status: ${response.status}, message: ${errorBody}`);
}
const data = await response.json();
@ -68,24 +74,28 @@ export class Hyperbolic {
console.log('Received response from Hyperbolic.');
} catch (err) {
if (
(err.message === 'Context length exceeded' || err.code === 'context_length_exceeded') &&
(err.message.includes('Context length exceeded') || err.code === 'context_length_exceeded') && // Adjusted to check includes for message
turns.length > 1
) {
console.log('Context length exceeded, trying again with a shorter context...');
return await this.sendRequest(turns.slice(1), systemMessage, stopSeq);
return await this.sendRequest(turns.slice(1), systemMessage, imageData, stopSeq); // Pass imageData
} else {
console.error(err);
completionContent = 'My brain disconnected, try again.';
// No break here, let it be set and then break after the think block logic
}
}
// Check for <think> blocks
const hasOpenTag = completionContent.includes("<think>");
const hasCloseTag = completionContent.includes("</think>");
if ((hasOpenTag && !hasCloseTag)) {
console.warn("Partial <think> block detected. Re-generating...");
continue; // Retry the request
if (attempt >= maxAttempts) { // If this was the last attempt
finalRes = "I thought too hard and got stuck in a loop, sorry, try again.";
break;
}
continue;
}
if (hasCloseTag && !hasOpenTag) {
@ -97,10 +107,10 @@ export class Hyperbolic {
}
finalRes = completionContent.replace(/<\|separator\|>/g, '*no response*');
break; // Valid response obtained—exit loop
break;
}
if (finalRes == null) {
if (finalRes == null) { // This condition might be hit if all attempts fail and continue
console.warn("Could not get a valid <think> block or normal response after max attempts.");
finalRes = 'I thought too hard, sorry, try again.';
}

View file

@ -7,13 +7,37 @@ export class Local {
this.url = url || 'http://127.0.0.1:11434';
this.chat_endpoint = '/api/chat';
this.embedding_endpoint = '/api/embeddings';
// Note: Actual multimodal support depends on the specific Ollama model (e.g., LLaVA, BakLLaVA)
this.supportsRawImageInput = true;
}
async sendRequest(turns, systemMessage) {
async sendRequest(turns, systemMessage, imageData = null) {
let model = this.model_name || 'sweaterdog/andy-4:latest'; // Changed to Andy-4
let messages = strictFormat(turns);
messages.unshift({ role: 'system', content: systemMessage });
if (imageData) {
console.warn(`[Ollama] imageData provided. Ensure the configured Ollama model ('${model}') is multimodal (e.g., llava, bakllava) to process images.`);
let lastUserMessageIndex = -1;
for (let i = messages.length - 1; i >= 0; i--) {
if (messages[i].role === 'user') {
lastUserMessageIndex = i;
break;
}
}
if (lastUserMessageIndex !== -1) {
if (!messages[lastUserMessageIndex].images) {
messages[lastUserMessageIndex].images = [];
}
messages[lastUserMessageIndex].images.push(imageData.toString('base64'));
} else {
console.warn('[Ollama] imageData provided, but no user message found to attach it to. Image not sent.');
// Or, could create a new user message:
// messages.push({ role: 'user', content: "Image attached.", images: [imageData.toString('base64')] });
}
}
// We'll attempt up to 5 times for models with deepseek-r1-esque reasoning if the <think> tags are mismatched.
const maxAttempts = 5;
let attempt = 0;
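
Ollama's chat endpoint expects images as a per-message `images` array of base64 strings rather than inline content parts; the modified message roughly becomes (base64 shortened):

// Illustrative: Ollama chat message after the image is attached.
const lastUserMessage = {
    role: 'user',
    content: 'steve: What do you see?',
    images: ['/9j/4AAQSkZJRg...']   // imageData.toString('base64')
};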

View file

@ -23,6 +23,7 @@ export class Mistral {
apiKey: getKey("MISTRAL_API_KEY")
}
);
this.supportsRawImageInput = false; // Standard chat completions may not support raw images for all models.
// Prevents the following code from running when model not specified
@ -35,7 +36,11 @@ export class Mistral {
}
}
async sendRequest(turns, systemMessage) {
async sendRequest(turns, systemMessage, imageData = null) {
if (imageData) {
console.warn(`[Mistral] Warning: imageData provided to sendRequest, but this method in mistral.js currently does not support direct image data embedding for model ${this.model_name}. The image will be ignored. Use sendVisionRequest for models/endpoints that support vision, or ensure the API/model used by sendRequest can handle images in its standard chat format.`);
// imageData is ignored for now.
}
let result;

View file

@ -16,15 +16,20 @@ export class Novita {
config.apiKey = getKey('NOVITA_API_KEY');
this.openai = new OpenAIApi(config);
// Direct image data in sendRequest is not supported by this wrapper.
this.supportsRawImageInput = false;
}
async sendRequest(turns, systemMessage, stop_seq='***') {
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
async sendRequest(turns, systemMessage, imageData = null, stop_seq='***') {
if (imageData) {
console.warn(`[Novita] Warning: imageData provided to sendRequest, but this method in novita.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
}
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
messages = strictFormat(messages);
messages = strictFormat(messages);
const pack = {
const pack = {
model: this.model_name || "meta-llama/llama-3.1-70b-instruct",
messages,
stop: [stop_seq],
@ -43,7 +48,7 @@ export class Novita {
catch (err) {
if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
console.log('Context length exceeded, trying again with shorter context.');
return await sendRequest(turns.slice(1), systemMessage, stop_seq);
return await this.sendRequest(turns.slice(1), systemMessage, imageData, stop_seq); // Added this. and imageData
} else {
console.log(err);
res = 'My brain disconnected, try again.';

View file

@ -18,9 +18,15 @@ export class OpenRouter {
config.apiKey = apiKey;
this.openai = new OpenAIApi(config);
// OpenRouter is a router; individual models might support vision.
// This generic sendRequest does not format for vision. Use sendVisionRequest or specific model logic.
this.supportsRawImageInput = false;
}
async sendRequest(turns, systemMessage, stop_seq='*') {
async sendRequest(turns, systemMessage, imageData = null, stop_seq='*') {
if (imageData) {
console.warn(`[OpenRouter] Warning: imageData provided to sendRequest. While OpenRouter can route to vision models, this generic method does not format for image data. The image will be ignored. Use sendVisionRequest or ensure your model call through OpenRouter is specifically formatted for vision if needed.`);
}
let messages = [{ role: 'system', content: systemMessage }, ...turns];
messages = strictFormat(messages);
@ -67,7 +73,9 @@ export class OpenRouter {
]
});
return this.sendRequest(imageMessages, systemMessage);
// sendVisionRequest formats its own message array; sendRequest here should not process new imageData.
// Pass systemMessage and stop_seq as originally intended by sendRequest.
return this.sendRequest(imageMessages, systemMessage, null, stop_seq);
}
async embed(text) {

View file

@ -334,9 +334,29 @@ export class Prompter {
let prompt = this.profile.conversing;
prompt = await this.replaceStrings(prompt, messages, this.convo_examples);
let generation;
let imageData = null;
if (settings.vision_mode === 'always' && messages.length > 0) {
const lastMessage = messages[messages.length - 1];
// Check if the last message has an imagePath and if the model supports raw image input
if (lastMessage.imagePath && this.chat_model.supportsRawImageInput) {
try {
// Construct the full path to the image file
const agentScreenshotDir = path.join('bots', this.agent.name, 'screenshots');
const imageFullPath = path.join(agentScreenshotDir, lastMessage.imagePath);
console.log(`[Prompter] Attempting to read image for 'always' vision mode: ${imageFullPath}`);
imageData = await fs.readFile(imageFullPath); // Read as buffer
console.log('[Prompter] Image data prepared for chat model.');
} catch (err) {
console.error(`[Prompter] Error reading image file ${lastMessage.imagePath}:`, err);
imageData = null; // Proceed without image data if reading fails
}
}
}
try {
generation = await this.chat_model.sendRequest(messages, prompt);
generation = await this.chat_model.sendRequest(messages, prompt, imageData);
if (typeof generation !== 'string') {
console.error('Error: Generated response is not a string', generation);
throw new Error('Generated response is not a string');
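
The prompter change is effectively a capability handshake: a model wrapper opts in via `supportsRawImageInput`, and the prompter only reads and forwards the screenshot when that flag is set and the last turn has an `imagePath`. A condensed sketch of that flow (hypothetical helper name, not the project's exact code):

import fs from 'fs/promises';
import path from 'path';

// Condensed sketch: forward raw image bytes only when the last turn carries an
// imagePath and the selected chat model wrapper advertises raw-image support.
async function buildImageData(agentName, messages, chatModel, visionMode) {
    if (visionMode !== 'always' || messages.length === 0) return null;
    const last = messages[messages.length - 1];
    if (!last.imagePath || !chatModel.supportsRawImageInput) return null;
    try {
        return await fs.readFile(path.join('bots', agentName, 'screenshots', last.imagePath));
    } catch (err) {
        console.error('Could not read screenshot, sending text only:', err);
        return null;   // degrade gracefully to a text-only request
    }
}

// Usage (mirrors the code above):
//   const imageData = await buildImageData(this.agent.name, messages, this.chat_model, settings.vision_mode);
//   generation = await this.chat_model.sendRequest(messages, prompt, imageData);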

View file

@ -12,15 +12,51 @@ export class Qwen {
config.apiKey = getKey('QWEN_API_KEY');
this.openai = new OpenAIApi(config);
// Note: Actual multimodal support depends on the specific Qwen model (e.g., qwen-vl-plus)
this.supportsRawImageInput = true;
}
async sendRequest(turns, systemMessage, stop_seq='***') {
async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
messages = strictFormat(messages);
if (imageData) {
// Qwen VL models include names like "qwen-vl-plus", "qwen-vl-max", "qwen-vl-chat-v1"
if (!this.model_name || !this.model_name.toLowerCase().includes('-vl')) {
console.warn(`[Qwen] Warning: imageData provided for model ${this.model_name}, which does not appear to be a Qwen Vision-Language (VL) model. The image may be ignored or cause an error.`);
}
let lastUserMessageIndex = -1;
for (let i = messages.length - 1; i >= 0; i--) {
if (messages[i].role === 'user') {
lastUserMessageIndex = i;
break;
}
}
if (lastUserMessageIndex !== -1) {
const userMessage = messages[lastUserMessageIndex];
if (typeof userMessage.content === 'string') { // Ensure content is a string before converting
userMessage.content = [
{ "text": userMessage.content },
{ "image": `data:image/jpeg;base64,${imageData.toString('base64')}` }
];
} else if (Array.isArray(userMessage.content)) {
// If content is already an array (e.g. from previous image), add new image
userMessage.content.push({ "image": `data:image/jpeg;base64,${imageData.toString('base64')}` });
} else {
console.warn('[Qwen] Last user message content is not a string or array. Creating new content array for image.');
userMessage.content = [{ "image": `data:image/jpeg;base64,${imageData.toString('base64')}` }];
}
} else {
console.warn('[Qwen] imageData provided, but no user message found to attach it to. Image not sent.');
// Alternative: Create a new user message with the image
// messages.push({ role: 'user', content: [{ "image": `data:image/jpeg;base64,${imageData.toString('base64')}` }] });
}
}
const pack = {
model: this.model_name || "qwen-plus",
model: this.model_name || "qwen-plus", // Default might need to be a VL model if images are common
messages,
stop: stop_seq,
...(this.params || {})

View file

@ -16,9 +16,15 @@ export class ReplicateAPI {
this.replicate = new Replicate({
auth: getKey('REPLICATE_API_KEY'),
});
// Direct image data in sendRequest is not supported by this wrapper.
// Replicate handles vision models differently, often with specific inputs like "image".
this.supportsRawImageInput = false;
}
async sendRequest(turns, systemMessage) {
async sendRequest(turns, systemMessage, imageData = null) {
if (imageData) {
console.warn(`[ReplicateAPI] Warning: imageData provided to sendRequest, but this method in replicate.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored. Replicate models with vision capabilities usually require specific input fields like 'image' with a URL or base64 string.`);
}
const stop_seq = '***';
const prompt = toSinglePrompt(turns, null, stop_seq);
let model_name = this.model_name || 'meta/meta-llama-3-70b-instruct';

View file

@ -19,9 +19,15 @@ export class VLLM {
vllm_config.apiKey = ""
this.vllm = new OpenAIApi(vllm_config);
// VLLM can serve various models. This generic sendRequest does not format for vision.
// Specific multimodal models served via VLLM might require custom request formatting.
this.supportsRawImageInput = false;
}
async sendRequest(turns, systemMessage, stop_seq = '***') {
async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
if (imageData) {
console.warn(`[VLLM] Warning: imageData provided to sendRequest, but this method in vllm.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored. Ensure the VLLM endpoint is configured for a multimodal model and the request is formatted accordingly if vision is intended.`);
}
let messages = [{ 'role': 'system', 'content': systemMessage }].concat(turns);
if (this.model_name.includes('deepseek') || this.model_name.includes('qwen')) {
@ -47,7 +53,7 @@ export class VLLM {
catch (err) {
if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
console.log('Context length exceeded, trying again with shorter context.');
return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
return await this.sendRequest(turns.slice(1), systemMessage, imageData, stop_seq);
} else {
console.log(err);
res = 'My brain disconnected, try again.';