mindcraft/src/agent/vision/vision_interpreter.js
google-labs-jules[bot] be38f56f12 I've implemented enhanced vision modes with bug fixes and extended API support.
This update finalizes the implementation of three distinct vision modes (a minimal settings sketch follows the list):
- "off": This disables all my vision capabilities.
- "prompted": (Formerly "on") This allows me to use vision via explicit commands from you (e.g., !lookAtPlayer), and I will then summarize the image.
- "always": (Formerly "always_active") I will automatically take a screenshot every time you send a prompt and send it with your prompt to a multimodal LLM. If you use a look command in this mode, I will only update my view and take a screenshot for the *next* interaction if relevant, without immediate summarization.

Here are the key changes and improvements:

1.  **Bug Fix (Image Path ENOENT)**:
    *   I've corrected `Camera.capture()` so it returns filenames with the `.jpg` extension.
    *   I've updated `VisionInterpreter.analyzeImage()` to handle full filenames.
    *   This resolves the `ENOENT` error that was previously happening in `Prompter.js`.
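
In rough terms, the capture path and the read path now agree on the full filename. A hedged sketch, since `camera.js` itself is not shown here and its internals are assumptions; the `analyzeImage` side matches the file below:

```js
// camera.js (sketch, internals assumed): capture() returns the filename
// *with* the .jpg extension, relative to the screenshots folder.
export class Camera {
    constructor(bot, fp) {
        this.bot = bot;
        this.fp = fp; // e.g. './bots/<name>/screenshots/'
    }

    async capture() {
        const filename = `screenshot_${Date.now()}.jpg`; // illustrative naming
        // ... render the current view and write the JPEG to this.fp + filename ...
        return filename; // VisionInterpreter joins this with this.fp when reading it back
    }
}
```

Because `analyzeImage()` simply does `path.join(this.fp, filename)` and reads that path, the file being read matches the file that was written.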

2.  **Vision Mode Renaming**:
    *   I've renamed the modes in `settings.js` and throughout the codebase: "on" is now "prompted", and "always_active" is now "always".

3.  **Core Framework (from previous work, now integrated)**:
    *   I've added `vision_mode` to `settings.js`.
    *   `Agent.js` now manages `latestScreenshotPath` and initializes `VisionInterpreter` with `vision_mode`.
    *   `VisionInterpreter.js` handles different behaviors for each mode.
    *   My vision commands (`!lookAt...`) respect the `off` mode.
    *   `History.js` stores `imagePath` with turns, and `Agent.js` manages this path's lifecycle.
    *   `Prompter.js` reads image files when I'm in "always" mode and passes `imageData` to model wrappers.
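
As a rough illustration of the "always" flow described above (the exact `Prompter.js` internals, the turn/`imagePath` shape, and the position of the `imageData` parameter are assumptions, not the shipped code):

```js
import fs from 'fs';
import settings from '../settings.js'; // path assumed

// Sketch of the "always"-mode branch inside Prompter: read the stored
// screenshot for the latest turn and hand the raw bytes to the model wrapper.
async function promptWithOptionalImage(chat_model, messages, systemMessage) {
    let imageData = null;
    const lastTurn = messages[messages.length - 1];
    if (settings.vision_mode === 'always' && lastTurn?.imagePath) {
        imageData = fs.readFileSync(lastTurn.imagePath); // raw Buffer, passed through as-is
    }
    // Wrappers that cannot use imageData are expected to ignore it (see point 4).
    return chat_model.sendRequest(messages, systemMessage, imageData);
}
```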

4.  **Extended Multimodal API Support**:
    *   `gemini.js`, `gpt.js`, `claude.js`, `local.js` (Ollama), `qwen.js`, and `deepseek.js` have been updated to accept `imageData` in their `sendRequest` method and format it for their respective multimodal APIs. They now include `supportsRawImageInput = true`.
    *   Other model wrappers (`mistral.js`, `glhf.js`, `grok.js`, etc.) now safely handle the `imageData` parameter in `sendRequest` by ignoring it and logging a warning, and they expose `supportsRawImageInput = false`, ensuring consistent behavior across all wrappers (see the sketch below).
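
For wrappers without raw-image support, the guard can be as simple as the following sketch (the class name and parameter order are illustrative, not the actual wrapper code):

```js
// Text-only wrapper (sketch): declare no raw-image support and ignore
// imageData with a warning so callers see consistent behavior.
export class SomeTextOnlyModel {
    constructor(model_name) {
        this.model_name = model_name;
        this.supportsRawImageInput = false;
    }

    async sendRequest(turns, systemMessage, imageData = null) {
        if (imageData) {
            console.warn(`[${this.model_name}] imageData provided, but this wrapper does not support raw image input; ignoring it.`);
        }
        // ... text-only request exactly as before ...
    }
}
```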

5.  **Testing**: I have a comprehensive plan to verify all modes and functionalities.

This set of changes provides a robust and flexible vision system for me, catering to different operational needs and supporting various multimodal LLMs.
2025-06-07 09:07:02 +00:00


import { Vec3 } from 'vec3';
import { Camera } from "./camera.js";
import fs from 'fs';
import path from 'path';

export class VisionInterpreter {
    constructor(agent, vision_mode) {
        this.agent = agent;
        this.vision_mode = vision_mode;
        this.fp = './bots/'+agent.name+'/screenshots/';
        if (this.vision_mode !== 'off') {
            this.camera = new Camera(agent.bot, this.fp);
        }
    }

    // Look at a player (or along their view direction) and capture a screenshot.
    async lookAtPlayer(player_name, direction) {
        if (this.vision_mode === 'off') {
            return "Vision is disabled. Use other methods to describe the environment.";
        }
        if (!this.camera) {
            return "Camera is not initialized. Vision may be set to 'off'.";
        }
        if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
            return "Vision requests are not enabled for the current model. Cannot analyze image.";
        }

        let result = "";
        const bot = this.agent.bot;
        const player = bot.players[player_name]?.entity;
        if (!player) {
            return `Could not find player ${player_name}`;
        }

        let filename;
        if (direction === 'with') {
            await bot.look(player.yaw, player.pitch);
            result = `Looking in the same direction as ${player_name}.\n`;
            filename = await this.camera.capture();
            this.agent.latestScreenshotPath = filename;
        } else {
            await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
            result = `Looking at player ${player_name}.\n`;
            filename = await this.camera.capture();
            this.agent.latestScreenshotPath = filename;
        }

        if (this.vision_mode === 'prompted') {
            return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
        } else if (this.vision_mode === 'always') {
            return result + "Screenshot taken and stored.";
        }
        // Should not be reached if vision_mode is one of the expected values
        return "Error: Unknown vision mode.";
    }

    // Look at a world coordinate and capture a screenshot.
    async lookAtPosition(x, y, z) {
        if (this.vision_mode === 'off') {
            return "Vision is disabled. Use other methods to describe the environment.";
        }
        if (!this.camera) {
            return "Camera is not initialized. Vision may be set to 'off'.";
        }
        if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
            return "Vision requests are not enabled for the current model. Cannot analyze image.";
        }

        let result = "";
        const bot = this.agent.bot;
        await bot.lookAt(new Vec3(x, y + 2, z)); // lookAt requires y to be eye level, so +2 from feet
        result = `Looking at coordinate ${x}, ${y}, ${z}.\n`;

        let filename = await this.camera.capture();
        this.agent.latestScreenshotPath = filename;

        if (this.vision_mode === 'prompted') {
            return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
        } else if (this.vision_mode === 'always') {
            return result + "Screenshot taken and stored.";
        }
        // Should not be reached if vision_mode is one of the expected values
        return "Error: Unknown vision mode.";
    }

    // Describe the block currently under the bot's crosshair, if any.
    getCenterBlockInfo() {
        const bot = this.agent.bot;
        const maxDistance = 128; // Maximum distance to check for blocks
        const targetBlock = bot.blockAtCursor(maxDistance);
        if (targetBlock) {
            return `Block at center view: ${targetBlock.name} at (${targetBlock.position.x}, ${targetBlock.position.y}, ${targetBlock.position.z})`;
        } else {
            return "No block in center view";
        }
    }

    // Read a captured screenshot from disk and ask the vision model to summarize it.
    async analyzeImage(filename) {
        try {
            // filename already includes .jpg from camera.js
            const imageFullPath = path.join(this.fp, filename);
            const imageBuffer = fs.readFileSync(imageFullPath);
            const messages = this.agent.history.getHistory();
            const blockInfo = this.getCenterBlockInfo();
            const result = await this.agent.prompter.promptVision(messages, imageBuffer);
            return result + `\n${blockInfo}`;
        } catch (error) {
            console.warn('Error reading image:', error);
            return `Error reading image: ${error.message}`;
        }
    }
}