mindcraft/src/agent/vision/vision_interpreter.js

import { Vec3 } from 'vec3';
import { Camera } from "./camera.js";
import fs from 'fs';
import path from 'path';

export class VisionInterpreter {
    constructor(agent, vision_mode) {
        this.agent = agent;
        this.vision_mode = vision_mode;
        this.fp = './bots/'+agent.name+'/screenshots/';
        if (this.vision_mode !== 'off') {
            this.camera = new Camera(agent.bot, this.fp);
        }
    }

    async lookAtPlayer(player_name, direction) {
        if (this.vision_mode === 'off') {
            return "Vision is disabled. Use other methods to describe the environment.";
        }
        if (!this.camera) {
            return "Camera is not initialized. Vision may be set to 'off'.";
        }
        if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
            return "Vision requests are not enabled for the current model. Cannot analyze image.";
        }

        let result = "";
        const bot = this.agent.bot;
        const player = bot.players[player_name]?.entity;
        if (!player) {
            return `Could not find player ${player_name}`;
        }

        let filename;
        if (direction === 'with') {
            await bot.look(player.yaw, player.pitch);
            result = `Looking in the same direction as ${player_name}.\n`;
            filename = await this.camera.capture();
            this.agent.latestScreenshotPath = filename;
        } else {
            await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
            result = `Looking at player ${player_name}.\n`;
            filename = await this.camera.capture();
            this.agent.latestScreenshotPath = filename;
        }

        if (this.vision_mode === 'prompted') {
            return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
        } else if (this.vision_mode === 'always') {
            return result + "Screenshot taken and stored.";
        }
        // Should not be reached if vision_mode is one of the expected values
        return "Error: Unknown vision mode.";
    }

    async lookAtPosition(x, y, z) {
        if (this.vision_mode === 'off') {
            return "Vision is disabled. Use other methods to describe the environment.";
        }
        if (!this.camera) {
            return "Camera is not initialized. Vision may be set to 'off'.";
        }
        if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
            return "Vision requests are not enabled for the current model. Cannot analyze image.";
        }

        let result = "";
        const bot = this.agent.bot;
        await bot.lookAt(new Vec3(x, y + 2, z)); // lookAt requires y to be eye level, so +2 from feet
        result = `Looking at coordinate ${x}, ${y}, ${z}.\n`;

        let filename = await this.camera.capture();
        this.agent.latestScreenshotPath = filename;

        if (this.vision_mode === 'prompted') {
            return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
        } else if (this.vision_mode === 'always') {
            return result + "Screenshot taken and stored.";
        }
        // Should not be reached if vision_mode is one of the expected values
        return "Error: Unknown vision mode.";
    }

    getCenterBlockInfo() {
        const bot = this.agent.bot;
        const maxDistance = 128; // Maximum distance to check for blocks
        const targetBlock = bot.blockAtCursor(maxDistance);

        if (targetBlock) {
            return `Block at center view: ${targetBlock.name} at (${targetBlock.position.x}, ${targetBlock.position.y}, ${targetBlock.position.z})`;
        } else {
            return "No block in center view";
        }
    }

    async analyzeImage(filename) {
        try {
            // filename already includes .jpg from camera.js
            const imageFullPath = path.join(this.fp, filename);
            const imageBuffer = fs.readFileSync(imageFullPath);
            const messages = this.agent.history.getHistory();

            const blockInfo = this.getCenterBlockInfo();
            const result = await this.agent.prompter.promptVision(messages, imageBuffer);
            return result + `\n${blockInfo}`;

        } catch (error) {
            console.warn('Error reading image:', error);
            return `Error reading image: ${error.message}`;
        }
    }
}