feat: move vision functions from skills into vision_interpreter

This commit is contained in:
gmuffiness 2025-01-24 13:16:36 +09:00
parent e4eda9c16a
commit 5fce0acaac
8 changed files with 175 additions and 73 deletions

View file

@ -9,7 +9,7 @@
"bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:",
"image_conversing": "You are a playful Minecraft bot. Briefly describe the screen you are looking at now.",
"image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 500 chars.",
"modes": {
"self_preservation": true,

View file

@ -32,6 +32,7 @@ export default
"show_bot_views": false, // show bot's view in browser at localhost:3000, 3001...
"allow_insecure_coding": false, // allows newAction command and model can write/run code on your computer. enable at own risk
"allow_vision": true, // allows vision model to interpret screenshots as inputs
"code_timeout_mins": 10, // minutes code is allowed to run. -1 for no timeout
"max_messages": 15, // max number of messages to keep in context

View file

@ -1,5 +1,6 @@
import { History } from './history.js';
import { Coder } from './coder.js';
import { VisionInterpreter } from './vision_interpreter.js';
import { Prompter } from './prompter.js';
import { initModes } from './modes.js';
import { initBot } from '../utils/mcdata.js';
@ -36,6 +37,8 @@ export class Agent {
this.history = new History(this);
console.log('Initializing coder...');
this.coder = new Coder(this);
console.log('Initializing vision intepreter...');
this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision);
console.log('Initializing npc controller...');
this.npc = new NPCContoller(this);
console.log('Initializing memory bank...');

View file

@ -422,7 +422,7 @@ export const actionsList = [
}
},
perform: runAsAction(async (agent, player_name, direction) => {
await skills.lookAtPlayer(agent, agent.bot, player_name, direction);
await agent.vision_interpreter.lookAtPlayer(player_name, direction);
})
},
{
@ -434,7 +434,7 @@ export const actionsList = [
'z': { type: 'int', description: 'z coordinate' }
},
perform: runAsAction(async (agent, x, y, z) => {
await skills.lookAtPosition(agent, agent.bot, x, y, z);
await agent.vision_interpreter.lookAtPosition(x, y, z);
})
}
];

View file

@ -1343,76 +1343,76 @@ export async function activateNearestBlock(bot, type) {
return true;
}
export async function lookAtPlayer(agent, bot, player_name, direction) {
/**
* Look at a player or look in the same direction as the player
* @param {MinecraftBot} bot reference to the minecraft bot
* @param {string} player_name name of the target player
* @param {string} direction 'at' to look at player, 'with' to look in same direction
* @returns {Promise<boolean>} whether the look action was successful
* @example
* await skills.lookAtPlayer(bot, "player1", "at");
* await skills.lookAtPlayer(bot, "player1", "with");
**/
// export async function lookAtPlayer(agent, bot, player_name, direction) {
// /**
// * Look at a player or look in the same direction as the player
// * @param {MinecraftBot} bot reference to the minecraft bot
// * @param {string} player_name name of the target player
// * @param {string} direction 'at' to look at player, 'with' to look in same direction
// * @returns {Promise<boolean>} whether the look action was successful
// * @example
// * await skills.lookAtPlayer(bot, "player1", "at");
// * await skills.lookAtPlayer(bot, "player1", "with");
// **/
const player = bot.players[player_name]?.entity;
if (!player) {
log(bot, `Could not find player ${player_name}`);
return false;
}
// const player = bot.players[player_name]?.entity;
// if (!player) {
// log(bot, `Could not find player ${player_name}`);
// return false;
// }
let filename;
if (direction === 'with') {
// Copy player's view direction
await bot.look(player.yaw, player.pitch);
const camera = new Camera(bot);
await new Promise(resolve => setTimeout(resolve, 500));
log(bot, `Looking in the same direction as ${player_name}`);
// let filename;
// if (direction === 'with') {
// // Copy player's view direction
// await bot.look(player.yaw, player.pitch);
// const camera = new Camera(bot);
// await new Promise(resolve => setTimeout(resolve, 500));
// log(bot, `Looking in the same direction as ${player_name}`);
filename = await camera.capture();
console.log(player.yaw, player.pitch);
// log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`);
// filename = await camera.capture();
// console.log(player.yaw, player.pitch);
// // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`);
} else {
// Look at player's position
await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
const camera = new Camera(bot);
await new Promise(resolve => setTimeout(resolve, 500));
log(bot, `Looking at player ${player_name}`);
// } else {
// // Look at player's position
// await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
// const camera = new Camera(bot);
// await new Promise(resolve => setTimeout(resolve, 500));
// log(bot, `Looking at player ${player_name}`);
filename = await camera.capture();
// log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`);
// log(bot, `Target coordinates: x:${player.position.x}, y:${player.position.y}, z:${player.position.z}`);
}
// filename = await camera.capture();
// // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`);
// // log(bot, `Target coordinates: x:${player.position.x}, y:${player.position.y}, z:${player.position.z}`);
// }
try {
const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`);
const messages = agent.history.getHistory();
let res = await agent.prompter.promptImageConvo(messages, imageBuffer);
log(bot, res);
return true;
} catch (error) {
log(bot, `Error analyzing image: ${error.message}`);
return false;
}
}
// try {
// const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`);
// const messages = agent.history.getHistory();
// let res = await agent.prompter.promptImageConvo(messages, imageBuffer);
// log(bot, res);
// return true;
// } catch (error) {
// log(bot, `Error analyzing image: ${error.message}`);
// return false;
// }
// }
export async function lookAtPosition(agent, bot, x, y, z) {
await bot.lookAt(new Vec3(x, y + 2, z));
const camera = new Camera(bot);
await new Promise(resolve => setTimeout(resolve, 500));
log(bot, `Looking at coordinate ${x, y, z}`);
// export async function lookAtPosition(agent, bot, x, y, z) {
// await bot.lookAt(new Vec3(x, y + 2, z));
// const camera = new Camera(bot);
// await new Promise(resolve => setTimeout(resolve, 500));
// log(bot, `Looking at coordinate ${x, y, z}`);
let filename = await camera.capture();
// let filename = await camera.capture();
try {
const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`);
const messages = agent.history.getHistory();
let res = await agent.prompter.promptImageConvo(messages, imageBuffer);
log(bot, res);
return true;
} catch (error) {
log(bot, `Error analyzing image: ${error.message}`);
return false;
}
}
// try {
// const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`);
// const messages = agent.history.getHistory();
// let res = await agent.prompter.promptImageConvo(messages, imageBuffer);
// log(bot, res);
// return true;
// } catch (error) {
// log(bot, `Error analyzing image: ${error.message}`);
// return false;
// }
// }

View file

@ -271,7 +271,7 @@ export class Prompter {
imageMessages.push({
role: "user",
content: [
{ type: "text", text: "Briefly describe the screen you are looking at now." },
{ type: "text", text: prompt },
{
type: "image_url",
image_url: {
@ -299,7 +299,7 @@ export class Prompter {
let resp = await this.chat_model.sendRequest(messages, prompt);
this.awaiting_coding = false;
return resp;
}
git }
async promptMemSaving(to_summarize) {
await this.checkCooldown();

View file

@ -0,0 +1,95 @@
import { Vec3 } from 'vec3';
import { Camera } from "../utils/camera.js";
import fs from 'fs';
import { log } from './library/skills.js';
import * as world from './library/world.js';
/**
 * Wrap a string in a leading and a trailing newline.
 * @param {string} text content to wrap
 * @returns {string} `text` with '\n' prepended and appended
 */
const pad = (text) => '\n' + text + '\n';
/**
 * Turns the bot's view toward a player or a coordinate, captures a screenshot
 * of that view and logs a description of it: either the vision model's answer
 * or, when vision is disabled, a text listing of nearby blocks.
 */
export class VisionInterpreter {
    /**
     * @param {object} agent owning agent; this class reads `agent.name`,
     *   `agent.bot`, `agent.history` and `agent.prompter`
     * @param {boolean} allow_vision when false, screenshots are not sent to the
     *   vision model and a text-based description is logged instead
     */
    constructor(agent, allow_vision) {
        this.agent = agent;
        this.allow_vision = allow_vision;
        // Directory handed to Camera as the screenshot location.
        this.fp = `./bots/${agent.name}/screenshots/`;
    }

    /**
     * Look at a player, or in the same direction as the player, then describe the view.
     * @param {string} player_name name of the target player
     * @param {string} direction 'with' copies the player's yaw/pitch; any other
     *   value looks at the player's head position
     * @returns {Promise<void>}
     */
    async lookAtPlayer(player_name, direction) {
        const bot = this.agent.bot;
        const player = bot.players[player_name]?.entity;
        if (!player) {
            log(bot, `Could not find player ${player_name}`);
            // Stop here: without an entity there is no yaw/pitch/position to
            // read, and continuing would throw a TypeError.
            return;
        }

        if (direction === 'with') {
            await bot.look(player.yaw, player.pitch);
            await this._captureAndDescribe(`Looking in the same direction as ${player_name}`);
        } else {
            const { x, y, z } = player.position;
            await bot.lookAt(new Vec3(x, y + player.height, z));
            await this._captureAndDescribe(`Looking at player ${player_name}`);
        }
    }

    /**
     * Look at a world coordinate and describe the view.
     * The look target is raised by 2 on the y axis.
     * @param {number} x x coordinate
     * @param {number} y y coordinate
     * @param {number} z z coordinate
     * @returns {Promise<void>}
     */
    async lookAtPosition(x, y, z) {
        const bot = this.agent.bot;
        await bot.lookAt(new Vec3(x, y + 2, z));
        // Interpolate each coordinate separately; `${x, y, z}` is the comma
        // operator and would print only z.
        await this._captureAndDescribe(`Looking at coordinate ${x}, ${y}, ${z}`);
    }

    /**
     * Read a previously captured screenshot and log the vision model's response.
     * Any error (missing file, model failure) is logged rather than thrown.
     * @param {string} filename screenshot name without the `.jpg` extension,
     *   resolved inside `this.fp`
     * @returns {Promise<void>}
     */
    async analyzeImage(filename) {
        const bot = this.agent.bot;
        try {
            // Non-blocking read so the event loop stays free while the file loads.
            const imageBuffer = await fs.promises.readFile(`${this.fp}/${filename}.jpg`);
            const messages = this.agent.history.getHistory();
            const res = await this.agent.prompter.promptImageConvo(messages, imageBuffer);
            log(bot, res);
        } catch (error) {
            log(bot, `Error analyzing image: ${error.message}`);
        }
    }

    /**
     * Shared tail of both look actions: wait, log the action, take a screenshot,
     * then describe it through the vision model or the text fallback.
     * @param {string} lookMessage message logged before the capture
     * @returns {Promise<void>}
     */
    async _captureAndDescribe(lookMessage) {
        const bot = this.agent.bot;
        const camera = new Camera(bot, this.fp);
        // Short pause after turning, before the screenshot is taken.
        await new Promise(resolve => setTimeout(resolve, 500));
        log(bot, lookMessage);
        const filename = await camera.capture();

        if (!this.allow_vision) {
            log(bot, "Vision is disabled. Using text-based environment description instead.");
            log(bot, this._nearbyBlocks());
            return;
        }
        await this.analyzeImage(filename);
    }

    /**
     * Build the text fallback: a NEARBY_BLOCKS listing followed, when any blocks
     * were found, by the surrounding blocks and the first block above the head.
     * @returns {string} the listing wrapped by `pad`
     */
    _nearbyBlocks() {
        const bot = this.agent.bot;
        const blocks = world.getNearbyBlockTypes(bot);
        let res = 'NEARBY_BLOCKS';
        for (const block of blocks) {
            res += `\n- ${block}`;
        }
        if (blocks.length === 0) {
            res += ': none';
        } else {
            // Environmental awareness
            res += '\n- ' + world.getSurroundingBlocks(bot).join('\n- ');
            res += `\n- First Solid Block Above Head: ${world.getFirstBlockAboveHead(bot, null, 32)}`;
        }
        return pad(res);
    }
}

View file

@ -13,9 +13,10 @@ global.Worker = worker_threads.Worker;
export class Camera extends EventEmitter {
constructor (bot) {
constructor (bot, fp) {
super()
this.bot = bot
this.fp = fp
this.viewDistance = 4
this.width = 800
this.height = 512
@ -42,6 +43,8 @@ export class Camera extends EventEmitter {
}
async capture() {
// waits some time helps renderer to render the world view
await new Promise(resolve => setTimeout(resolve, 1000));
this.renderer.render(this.viewer.scene, this.viewer.camera);
const imageStream = this.canvas.createJPEGStream({
@ -55,7 +58,7 @@ export class Camera extends EventEmitter {
const buf = await getBufferFromStream(imageStream);
await this._ensureScreenshotDirectory();
await fs.writeFile(`bots/${this.bot.username}/screenshots/${filename}.jpg`, buf);
await fs.writeFile(`${this.fp}/${filename}.jpg`, buf);
console.log('saved', filename);
return filename;
}
@ -63,10 +66,10 @@ export class Camera extends EventEmitter {
async _ensureScreenshotDirectory() {
let stats;
try {
stats = await fs.stat(`bots/${this.bot.username}/screenshots`);
stats = await fs.stat(this.fp);
} catch (e) {
if (!stats?.isDirectory()) {
await fs.mkdir(`bots/${this.bot.username}/screenshots`);
await fs.mkdir(this.fp);
}
}
}