diff --git a/settings.js b/settings.js index 380e5b9..a2757eb 100644 --- a/settings.js +++ b/settings.js @@ -35,6 +35,7 @@ const settings = { "allow_insecure_coding": false, // allows newAction command and model can write/run code on your computer. enable at own risk "allow_vision": false, // allows vision model to interpret screenshots as inputs + "vision_mode": "on", // "off", "on", or "always_active" "blocked_actions" : ["!checkBlueprint", "!checkBlueprintLevel", "!getBlueprint", "!getBlueprintLevel"] , // commands to disable and remove from docs. Ex: ["!setMode"] "code_timeout_mins": -1, // minutes code is allowed to run. -1 for no timeout "relevant_docs_count": 5, // number of relevant code function docs to select for prompting. -1 for all diff --git a/src/agent/agent.js b/src/agent/agent.js index 3cd671b..bbaabdd 100644 --- a/src/agent/agent.js +++ b/src/agent/agent.js @@ -20,6 +20,7 @@ import { say } from './speak.js'; export class Agent { async start(profile_fp, load_mem=false, init_message=null, count_id=0, task_path=null, task_id=null) { this.last_sender = null; + this.latestScreenshotPath = null; this.count_id = count_id; if (!profile_fp) { throw new Error('No profile filepath provided'); @@ -116,7 +117,7 @@ export class Agent { this.checkAllPlayersPresent(); console.log('Initializing vision intepreter...'); - this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision); + this.vision_interpreter = new VisionInterpreter(this, settings.vision_mode); } catch (error) { console.error('Error in spawn event:', error); @@ -172,7 +173,8 @@ export class Agent { if (save_data?.self_prompt) { if (init_message) { - this.history.add('system', init_message); + // Assuming init_message for self_prompt loading doesn't have an image + await this.history.add('system', init_message, null); } await this.self_prompter.handleLoad(save_data.self_prompt, save_data.self_prompting_state); } @@ -246,6 +248,15 @@ export class Agent { const from_other_bot = 
convoManager.isOtherAgent(source); if (!self_prompt && !from_other_bot) { // from user, check for forced commands + if (settings.vision_mode === 'always_active' && this.vision_interpreter && this.vision_interpreter.camera) { + try { + const screenshotFilename = await this.vision_interpreter.camera.capture(); + this.latestScreenshotPath = screenshotFilename; + console.log(`[${this.name}] Captured screenshot in always_active mode: ${screenshotFilename}`); + } catch (error) { + console.error(`[${this.name}] Error capturing screenshot in always_active mode:`, error); + } + } const user_command_name = containsCommand(message); if (user_command_name) { if (!commandExists(user_command_name)) { @@ -256,7 +267,16 @@ export class Agent { if (user_command_name === '!newAction') { // all user-initiated commands are ignored by the bot except for this one // add the preceding message to the history to give context for newAction - this.history.add(source, message); + // This is the user's message that contains the !newAction command. + // If a screenshot was taken due to always_active, it should be associated here. + let imagePathForNewActionCmd = null; + if (settings.vision_mode === 'always_active' && this.latestScreenshotPath && !self_prompt && !from_other_bot) { + imagePathForNewActionCmd = this.latestScreenshotPath; + } + await this.history.add(source, message, imagePathForNewActionCmd); + if (imagePathForNewActionCmd) { + this.latestScreenshotPath = null; // Consume path + } } let execute_res = await executeCommand(this, message); if (execute_res) @@ -281,11 +301,29 @@ export class Agent { behavior_log = '...' 
+ behavior_log.substring(behavior_log.length - MAX_LOG); } behavior_log = 'Recent behaviors log: \n' + behavior_log; - await this.history.add('system', behavior_log); + await this.history.add('system', behavior_log, null); // Behavior log unlikely to have an image } - // Handle other user messages - await this.history.add(source, message); + // Handle other user messages (or initial system messages) + let imagePathForInitialMessage = null; + if (!self_prompt && !from_other_bot) { + // If it's a user message and a screenshot was auto-captured for always_active + if (settings.vision_mode === 'always_active' && this.latestScreenshotPath) { + imagePathForInitialMessage = this.latestScreenshotPath; + } + } else if (source === 'system' && this.latestScreenshotPath && message.startsWith("You died at position")) { + // Example: System death message might use a path if set by some (future) death-capture logic + // For now, this is illustrative; death messages don't set latestScreenshotPath. + // More relevant if a system message is a direct consequence of an action that *did* set the path. + // However, explicit command result handling is better for those. + // imagePathForInitialMessage = this.latestScreenshotPath; // Generally, system messages here won't have an image unless specific logic sets it. 
+ } + + + await this.history.add(source, message, imagePathForInitialMessage); + if (imagePathForInitialMessage) { + this.latestScreenshotPath = null; // Consume the path if used + } this.history.save(); if (!self_prompt && this.self_prompter.isActive()) // message is from user during self-prompting @@ -306,10 +344,12 @@ export class Agent { if (command_name) { // contains query or command res = truncCommandMessage(res); // everything after the command is ignored - this.history.add(this.name, res); + // Agent's own message stating the command it will execute + await this.history.add(this.name, res, null); if (!commandExists(command_name)) { - this.history.add('system', `Command ${command_name} does not exist.`); + // Agent hallucinated a command + await this.history.add('system', `Command ${command_name} does not exist.`, null); console.warn('Agent hallucinated command:', command_name) continue; } @@ -333,13 +373,24 @@ export class Agent { console.log('Agent executed:', command_name, 'and got:', execute_res); used_command = true; - if (execute_res) - this.history.add('system', execute_res); - else + if (execute_res) { + let imagePathForCommandResult = null; + // Vision commands (!lookAtPlayer, !lookAtPosition) set latestScreenshotPath in VisionInterpreter. + // This is relevant if mode is 'on' (analysis done, path stored by VI) or 'always_active' (screenshot taken, path stored by VI). 
+ if (command_name && (command_name === '!lookAtPlayer' || command_name === '!lookAtPosition') && this.latestScreenshotPath) { + imagePathForCommandResult = this.latestScreenshotPath; + } + await this.history.add('system', execute_res, imagePathForCommandResult); + if (imagePathForCommandResult) { + this.latestScreenshotPath = null; // Consume the path + } + } + else { // command execution didn't return anything or failed in a way that implies loop break break; + } } - else { // conversation response - this.history.add(this.name, res); + else { // conversation response (no command) + await this.history.add(this.name, res, null); // Agent's text response, no image typically this.routeResponse(source, res); break; } @@ -488,7 +539,8 @@ export class Agent { cleanKill(msg='Killing agent process...', code=1) { - this.history.add('system', msg); + // cleanKill is not async, so history.add must not be awaited here; these messages have no images + this.history.add('system', msg, null); this.bot.chat(code > 1 ? 'Restarting.': 'Exiting.'); this.history.save(); process.exit(code); @@ -497,7 +549,8 @@ export class Agent { if (this.task.data) { let res = this.task.isDone(); if (res) { - await this.history.add('system', `Task ended with score : ${res.score}`); + // Assuming task end messages don't have images + await this.history.add('system', `Task ended with score : ${res.score}`, null); await this.history.save(); // await new Promise(resolve => setTimeout(resolve, 3000)); // Wait 3 second for save to complete console.log('Task finished:', res.message); diff --git a/src/agent/commands/actions.js b/src/agent/commands/actions.js index b2b3ccb..c5fb1dc 100644 --- a/src/agent/commands/actions.js +++ b/src/agent/commands/actions.js @@ -428,6 +428,13 @@ export const actionsList = [ } }, perform: async function(agent, player_name, direction) { + if (agent.vision_interpreter && agent.vision_interpreter.vision_mode === 'off') { + return "Vision commands are disabled as vision mode is 'off'."; + } + // Also check if 
vision_interpreter or camera is not available if mode is not 'off' + if (agent.vision_interpreter && !agent.vision_interpreter.camera && agent.vision_interpreter.vision_mode !== 'off') { + return "Camera is not available, cannot perform look command."; + } if (direction !== 'at' && direction !== 'with') { return "Invalid direction. Use 'at' or 'with'."; } @@ -448,6 +455,13 @@ export const actionsList = [ 'z': { type: 'int', description: 'z coordinate' } }, perform: async function(agent, x, y, z) { + if (agent.vision_interpreter && agent.vision_interpreter.vision_mode === 'off') { + return "Vision commands are disabled as vision mode is 'off'."; + } + // Also check if vision_interpreter or camera is not available if mode is not 'off' + if (agent.vision_interpreter && !agent.vision_interpreter.camera && agent.vision_interpreter.vision_mode !== 'off') { + return "Camera is not available, cannot perform look command."; + } let result = ""; const actionFn = async () => { result = await agent.vision_interpreter.lookAtPosition(x, y, z); diff --git a/src/agent/history.js b/src/agent/history.js index 13b9c79..96073de 100644 --- a/src/agent/history.js +++ b/src/agent/history.js @@ -58,7 +58,7 @@ export class History { } } - async add(name, content) { + async add(name, content, imagePath = null) { let role = 'assistant'; if (name === 'system') { role = 'system'; @@ -67,7 +67,7 @@ export class History { role = 'user'; content = `${name}: ${content}`; } - this.turns.push({role, content}); + this.turns.push({role, content, imagePath}); if (this.turns.length >= this.max_messages) { let chunk = this.turns.splice(0, this.summary_chunk_size); diff --git a/src/agent/vision/vision_interpreter.js b/src/agent/vision/vision_interpreter.js index a43acd2..7ae3b18 100644 --- a/src/agent/vision/vision_interpreter.js +++ b/src/agent/vision/vision_interpreter.js @@ -3,19 +3,26 @@ import { Camera } from "./camera.js"; import fs from 'fs'; export class VisionInterpreter { - constructor(agent, 
allow_vision) { + constructor(agent, vision_mode) { this.agent = agent; - this.allow_vision = allow_vision; + this.vision_mode = vision_mode; this.fp = './bots/'+agent.name+'/screenshots/'; - if (allow_vision) { + if (this.vision_mode !== 'off') { this.camera = new Camera(agent.bot, this.fp); } } async lookAtPlayer(player_name, direction) { - if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) { + if (this.vision_mode === 'off') { return "Vision is disabled. Use other methods to describe the environment."; } + if (!this.camera) { + return "Camera is not initialized. Vision may be set to 'off'."; + } + if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'on') { + return "Vision requests are not enabled for the current model. Cannot analyze image."; + } + let result = ""; const bot = this.agent.bot; const player = bot.players[player_name]?.entity; @@ -26,30 +33,51 @@ export class VisionInterpreter { let filename; if (direction === 'with') { await bot.look(player.yaw, player.pitch); - result = `Looking in the same direction as ${player_name}\n`; + result = `Looking in the same direction as ${player_name}.\n`; filename = await this.camera.capture(); + this.agent.latestScreenshotPath = filename; } else { await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z)); - result = `Looking at player ${player_name}\n`; + result = `Looking at player ${player_name}.\n`; filename = await this.camera.capture(); - + this.agent.latestScreenshotPath = filename; } - return result + `Image analysis: "${await this.analyzeImage(filename)}"`; + if (this.vision_mode === 'on') { + return result + `Image analysis: "${await this.analyzeImage(filename)}"`; + } else if (this.vision_mode === 'always_active') { + return result + "Screenshot taken and stored."; + } + // Should not be reached if vision_mode is one of the expected values + return "Error: Unknown vision mode."; } async lookAtPosition(x, y, 
z) { - if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) { + if (this.vision_mode === 'off') { return "Vision is disabled. Use other methods to describe the environment."; } + if (!this.camera) { + return "Camera is not initialized. Vision may be set to 'off'."; + } + if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'on') { + return "Vision requests are not enabled for the current model. Cannot analyze image."; + } + let result = ""; const bot = this.agent.bot; - await bot.lookAt(new Vec3(x, y + 2, z)); - result = `Looking at coordinate ${x}, ${y}, ${z}\n`; + await bot.lookAt(new Vec3(x, y + 2, z)); // lookAt requires y to be eye level, so +2 from feet + result = `Looking at coordinate ${x}, ${y}, ${z}.\n`; let filename = await this.camera.capture(); + this.agent.latestScreenshotPath = filename; - return result + `Image analysis: "${await this.analyzeImage(filename)}"`; + if (this.vision_mode === 'on') { + return result + `Image analysis: "${await this.analyzeImage(filename)}"`; + } else if (this.vision_mode === 'always_active') { + return result + "Screenshot taken and stored."; + } + // Should not be reached if vision_mode is one of the expected values + return "Error: Unknown vision mode."; } getCenterBlockInfo() { diff --git a/src/models/gemini.js b/src/models/gemini.js index 4d24c93..a205753 100644 --- a/src/models/gemini.js +++ b/src/models/gemini.js @@ -31,9 +31,10 @@ export class Gemini { ]; this.genAI = new GoogleGenerativeAI(getKey('GEMINI_API_KEY')); + this.supportsRawImageInput = true; } - async sendRequest(turns, systemMessage) { + async sendRequest(turns, systemMessage, imageData = null) { let model; const modelConfig = { model: this.model_name || "gemini-1.5-flash", @@ -64,6 +65,24 @@ export class Gemini { }); } + if (imageData && contents.length > 0) { + const lastContent = contents[contents.length - 1]; + if (lastContent.role === 'user') { // Ensure the image is added to a user turn + 
lastContent.parts.push({ + inlineData: { + mimeType: 'image/jpeg', + data: imageData.toString('base64') + } + }); + } else { + // This case should ideally not happen if imageData is tied to a user message. + // If it does, we could append a new user turn with the image, + // or log a warning and send without the image. + // For now, let's assume the last message is the user's if imageData is present. + console.warn('[Gemini] imageData provided, but the last content entry was not from a user. Image not sent.'); + } + } + const result = await model.generateContent({ contents, generationConfig: { diff --git a/src/models/prompter.js b/src/models/prompter.js index e05f5a8..931bef2 100644 --- a/src/models/prompter.js +++ b/src/models/prompter.js @@ -334,9 +334,29 @@ export class Prompter { let prompt = this.profile.conversing; prompt = await this.replaceStrings(prompt, messages, this.convo_examples); let generation; + let imageData = null; + + if (settings.vision_mode === 'always_active' && messages.length > 0) { + const lastMessage = messages[messages.length - 1]; + // Check if the last message has an imagePath and if the model supports raw image input + if (lastMessage.imagePath && this.chat_model.supportsRawImageInput) { + try { + // Construct the full path to the image file + const agentScreenshotDir = path.join('bots', this.agent.name, 'screenshots'); + const imageFullPath = path.join(agentScreenshotDir, lastMessage.imagePath); + + console.log(`[Prompter] Attempting to read image for always_active mode: ${imageFullPath}`); + imageData = await fs.readFile(imageFullPath); // Read as buffer + console.log('[Prompter] Image data prepared for chat model.'); + } catch (err) { + console.error(`[Prompter] Error reading image file ${lastMessage.imagePath}:`, err); + imageData = null; // Proceed without image data if reading fails + } + } + } try { - generation = await this.chat_model.sendRequest(messages, prompt); + generation = await this.chat_model.sendRequest(messages, 
prompt, imageData); if (typeof generation !== 'string') { console.error('Error: Generated response is not a string', generation); throw new Error('Generated response is not a string');