Mirror of https://github.com/kolbytn/mindcraft.git, synced 2025-07-26 09:55:26 +02:00
google-labs-jules[bot]
This update finalizes the implementation of three distinct vision modes:

- "off": Disables all of my vision capabilities.
- "prompted" (formerly "on"): I use vision only via explicit commands from you (e.g., !lookAtPlayer), and I then summarize the image.
- "always" (formerly "always_active"): I automatically take a screenshot every time you send a prompt and send it with your prompt to a multimodal LLM. If you use a look command in this mode, I only update my view and store a screenshot for the *next* interaction, without immediate summarization.

Here are the key changes and improvements:

1. **Bug Fix (Image Path ENOENT)**:
    * I've corrected `Camera.capture()` so it returns filenames with the `.jpg` extension.
    * I've updated `VisionInterpreter.analyzeImage()` to handle full filenames.
    * This resolves the `ENOENT` error that previously occurred in `Prompter.js`.
2. **Vision Mode Renaming**:
    * I've renamed the modes in `settings.js` and throughout the codebase: "on" is now "prompted", and "always_active" is now "always".
3. **Core Framework (from previous work, now integrated)**:
    * I've added `vision_mode` to `settings.js`.
    * `Agent.js` now manages `latestScreenshotPath` and initializes `VisionInterpreter` with `vision_mode`.
    * `VisionInterpreter.js` implements the distinct behavior of each mode.
    * My vision commands (`!lookAt...`) respect the `off` mode.
    * `History.js` stores `imagePath` with turns, and `Agent.js` manages this path's lifecycle.
    * `Prompter.js` reads image files when I'm in "always" mode and passes `imageData` to the model wrappers.
4. **Extended Multimodal API Support**:
    * `gemini.js`, `gpt.js`, `claude.js`, `local.js` (Ollama), `qwen.js`, and `deepseek.js` now accept `imageData` in their `sendRequest` method and format it for their respective multimodal APIs. They set `supportsRawImageInput = true`.
    * Other model wrappers (`mistral.js`, `glhf.js`, `grok.js`, etc.) safely handle the `imageData` parameter in `sendRequest` by ignoring it and logging a warning, and set `supportsRawImageInput = false`, ensuring consistent behavior (see the sketch after this message).
5. **Testing**: I have a comprehensive plan to verify all modes and functionality.

This set of changes provides a robust and flexible vision system for me, catering to different operational needs and supporting various multimodal LLMs.
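As a rough illustration of the fallback behavior in item 4, a text-only wrapper's `sendRequest` might look like the sketch below. The class name, constructor shape, and `_callApi` helper are assumptions for illustration, not the actual code in `mistral.js`, `glhf.js`, or `grok.js`:

```javascript
// Hypothetical sketch of a text-only model wrapper; only the imageData
// handling and the supportsRawImageInput flag reflect the behavior described above.
export class ExampleTextOnlyModel {
    constructor(model_name) {
        this.model_name = model_name;
        // Tells Prompter.js not to route raw image bytes to this wrapper.
        this.supportsRawImageInput = false;
    }

    async sendRequest(turns, systemMessage, imageData = null) {
        if (imageData) {
            // Ignore the image instead of failing, so 'always' mode degrades gracefully.
            console.warn(`[${this.model_name}] imageData provided, but this model does not support raw image input; ignoring it.`);
        }
        // Build and send the text-only request as before (hypothetical helper).
        return await this._callApi(turns, systemMessage);
    }
}
```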
112 lines · No EOL · 4.4 KiB · JavaScript
import { Vec3 } from 'vec3';
import { Camera } from "./camera.js";
import fs from 'fs';
import path from 'path';

export class VisionInterpreter {
    constructor(agent, vision_mode) {
        this.agent = agent;
        this.vision_mode = vision_mode;
        // Directory where this bot's screenshots are stored.
        this.fp = './bots/'+agent.name+'/screenshots/';
        // Only create a camera when vision is enabled.
        if (this.vision_mode !== 'off') {
            this.camera = new Camera(agent.bot, this.fp);
        }
    }

    async lookAtPlayer(player_name, direction) {
        if (this.vision_mode === 'off') {
            return "Vision is disabled. Use other methods to describe the environment.";
        }
        if (!this.camera) {
            return "Camera is not initialized. Vision may be set to 'off'.";
        }
        // 'prompted' mode summarizes the image immediately, which requires a vision-capable model.
        if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
            return "Vision requests are not enabled for the current model. Cannot analyze image.";
        }

        let result = "";
        const bot = this.agent.bot;
        const player = bot.players[player_name]?.entity;
        if (!player) {
            return `Could not find player ${player_name}`;
        }

        // Orient the bot, then capture a screenshot and remember its path.
        if (direction === 'with') {
            await bot.look(player.yaw, player.pitch);
            result = `Looking in the same direction as ${player_name}.\n`;
        } else {
            await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
            result = `Looking at player ${player_name}.\n`;
        }
        const filename = await this.camera.capture();
        this.agent.latestScreenshotPath = filename;

        if (this.vision_mode === 'prompted') {
            return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
        } else if (this.vision_mode === 'always') {
            // 'always' mode defers analysis: the screenshot is sent with the next prompt.
            return result + "Screenshot taken and stored.";
        }
        // Should not be reached if vision_mode is one of the expected values
        return "Error: Unknown vision mode.";
    }

    async lookAtPosition(x, y, z) {
        if (this.vision_mode === 'off') {
            return "Vision is disabled. Use other methods to describe the environment.";
        }
        if (!this.camera) {
            return "Camera is not initialized. Vision may be set to 'off'.";
        }
        if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
            return "Vision requests are not enabled for the current model. Cannot analyze image.";
        }

        const bot = this.agent.bot;
        await bot.lookAt(new Vec3(x, y + 2, z)); // lookAt targets eye level, so aim +2 above the feet
        const result = `Looking at coordinate ${x}, ${y}, ${z}.\n`;

        const filename = await this.camera.capture();
        this.agent.latestScreenshotPath = filename;

        if (this.vision_mode === 'prompted') {
            return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
        } else if (this.vision_mode === 'always') {
            return result + "Screenshot taken and stored.";
        }
        // Should not be reached if vision_mode is one of the expected values
        return "Error: Unknown vision mode.";
    }

    getCenterBlockInfo() {
        const bot = this.agent.bot;
        const maxDistance = 128; // Maximum distance to check for blocks
        const targetBlock = bot.blockAtCursor(maxDistance);

        if (targetBlock) {
            return `Block at center view: ${targetBlock.name} at (${targetBlock.position.x}, ${targetBlock.position.y}, ${targetBlock.position.z})`;
        } else {
            return "No block in center view";
        }
    }

    async analyzeImage(filename) {
        try {
            // filename already includes .jpg from camera.js
            const imageFullPath = path.join(this.fp, filename);
            const imageBuffer = fs.readFileSync(imageFullPath);
            const messages = this.agent.history.getHistory();

            // Append what the bot is looking at so the model has grounded context.
            const blockInfo = this.getCenterBlockInfo();
            const result = await this.agent.prompter.promptVision(messages, imageBuffer);
            return result + `\n${blockInfo}`;

        } catch (error) {
            console.warn('Error reading image:', error);
            return `Error reading image: ${error.message}`;
        }
    }
}
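For context, here is a minimal usage sketch, assuming an `agent` object shaped like the one referenced above (with `bot`, `name`, `prompter`, and `history`); the import paths are assumptions:

```javascript
// Minimal usage sketch; import paths and the agent shape are assumptions.
import settings from './settings.js';
import { VisionInterpreter } from './vision_interpreter.js';

async function demoVision(agent) {
    // Agent.js constructs the interpreter with the configured mode.
    const interpreter = new VisionInterpreter(agent, settings.vision_mode);

    // 'prompted' mode returns an immediate image summary;
    // 'always' mode stores the screenshot for the next prompt instead.
    console.log(await interpreter.lookAtPlayer('Steve', 'at'));
    console.log(await interpreter.lookAtPosition(100, 64, -200));
}
```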