From 1be24f4867c2a49d01c591fe2bab8ad4f07a007e Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Wed, 15 Jan 2025 17:26:13 +0900 Subject: [PATCH 01/27] feat: add screenshots and look action works on gpt --- package.json | 4 +- settings.js | 4 +- src/agent/commands/actions.js | 55 +++++++++++++++++++++++++ src/agent/library/skills.js | 77 +++++++++++++++++++++++++++++++++++ 4 files changed, 137 insertions(+), 3 deletions(-) diff --git a/package.json b/package.json index 689f8db..00901de 100644 --- a/package.json +++ b/package.json @@ -24,7 +24,9 @@ "yargs": "^17.7.2", "socket.io": "^4.7.2", "socket.io-client": "^4.7.2", - "express": "^4.18.2" + "express": "^4.18.2", + "three": "0.128.0", + "node-canvas-webgl": "PrismarineJS/node-canvas-webgl" }, "scripts": { "postinstall": "patch-package", diff --git a/settings.js b/settings.js index a4681fa..f8dc1ba 100644 --- a/settings.js +++ b/settings.js @@ -2,7 +2,7 @@ export default { "minecraft_version": "1.20.4", // supports up to 1.21.1 "host": "127.0.0.1", // or "localhost", "your.ip.address.here" - "port": 55916, + "port": 56069, "auth": "offline", // or "microsoft" // the mindserver manages all agents and hosts the UI @@ -25,7 +25,7 @@ export default // using more than 1 profile requires you to /msg each bot indivually ], "load_memory": false, // load memory from previous session - "init_message": "Respond with hello world and your name", // sends to all on spawn + // "init_message": "Respond with hello world and your name", // sends to all on spawn "only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly "language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages diff --git a/src/agent/commands/actions.js b/src/agent/commands/actions.js index 34e6693..1c6bbfe 100644 --- a/src/agent/commands/actions.js +++ b/src/agent/commands/actions.js @@ -1,6 +1,8 @@ import * as skills from '../library/skills.js'; import settings from '../../../settings.js'; import convoManager from '../conversation.js'; +import fs from 'fs'; +import { GPT } from '../../models/gpt.js'; function runAsAction (actionFn, resume = false, timeout = -1) { let actionLabel = null; // Will be set on first use @@ -407,6 +409,59 @@ export const actionsList = [ return `Converstaion with ${player_name} ended.`; } }, + { + name: '!takeScreenshot', + description: 'Takes and saves a screenshot of the specified coordinates.', + params: { + 'x': { + type: 'int', + description: 'x coordinate to capture', + optional: true + }, + 'y': { + type: 'int', + description: 'y coordinate to capture', + optional: true + }, + 'z': { + type: 'int', + description: 'z coordinate to capture', + optional: true + }, + 'filename': { + type: 'string', + description: 'Filename to save (without extension). 
If not specified, saves with timestamp.', + optional: true + } + }, + perform: runAsAction(async (agent, x, y, z, filename) => { + await skills.takeScreenshot(agent.bot, x, y, z, filename); + }) + }, + { + name: '!look', + description: 'Takes a screenshot of specified coordinates and analyzes its contents.', + params: { + 'x': { + type: 'int', + description: 'x coordinate to look at', + optional: true + }, + 'y': { + type: 'int', + description: 'y coordinate to look at', + optional: true + }, + 'z': { + type: 'int', + description: 'z coordinate to look at', + optional: true + } + }, + perform: runAsAction(async (agent, x, y, z) => { + await skills.look(agent, x, y, z); + }) + }, // { // commented for now, causes confusion with goal command // name: '!npcGoal', // description: 'Set a simple goal for an item or building to automatically work towards. Do not use for complex goals.', diff --git a/src/agent/library/skills.js b/src/agent/library/skills.js index be5882f..e492d16 100644 --- a/src/agent/library/skills.js +++ b/src/agent/library/skills.js @@ -2,6 +2,8 @@ import * as mc from "../../utils/mcdata.js"; import * as world from "./world.js"; import pf from 'mineflayer-pathfinder'; import Vec3 from 'vec3'; +import fs from 'fs'; +import { Camera } from "../../utils/camera.js"; export function log(bot, message) { @@ -1340,3 +1342,78 @@ export async function activateNearestBlock(bot, type) { log(bot, `Activated ${type} at x:${block.position.x.toFixed(1)}, y:${block.position.y.toFixed(1)}, z:${block.position.z.toFixed(1)}.`); return true; } + +export async function takeScreenshot(bot, x, y, z, filename=null) { + /** + * Takes a screenshot from the bot's current view or specified position + * @param {MinecraftBot} bot, reference to the minecraft bot + * @param {int} x x coordinate to look at (optional) + * @param {int} y y coordinate to look at (optional) + * @param {int} z z coordinate to look at (optional) + * @param {string} filename filename to save (without extension). If not specified, saves with timestamp + * @returns {Promise} whether the screenshot was successful + * @example + * await skills.takeScreenshot(bot, { name: 'my_screenshot', x: 100, y: 65, z: -200 }); + **/ + + try { + bot.camera = new Camera(bot); + await new Promise(resolve => bot.camera.once('ready', resolve)); + + await bot.lookAt(new Vec3(x, y, z)); + await new Promise(resolve => setTimeout(resolve, 500)); + + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + if (filename === null) { + filename = `screenshot_${timestamp}`; + } + await bot.camera.takePicture(filename, x, y, z); + + log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); + log(bot, `Target coordinates: x:${x}, y:${y}, z:${z}`); + return [true, filename]; + } catch (err) { + log(bot, `Failed to take screenshot: ${err.message}`); + return [false, null]; + } +} + +export async function look(agent, x, y, z) { + const bot = agent.bot; + const history = agent.history; + + const [success, filename] = await takeScreenshot(bot, x, y, z); + if (!success) { + log(bot, `Failed to take screenshot: ${filename}`); + return false; + } + + try { + const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); + const base64Image = imageBuffer.toString('base64'); + + let messages = history.getHistory(); + messages.push({ + role: "user", + content: [ + { type: "text", text: "Briefly describe the screen you are looking at now." 
}, + { + type: "image_url", + image_url: { + "url": `data:image/jpeg;base64,${base64Image}`, + } + } + ] + }); + console.log(messages); + + let res = await agent.prompter.chat_model.sendRequest(messages, `You are a playful Minecraft bot. Briefly describe the screen you are looking at now.`); + console.log(res); + + log(bot, res); + return true; + } catch (error) { + log(bot, `Error analyzing image: ${error.message}`); + return false; + } +} \ No newline at end of file From f5923db43a67c6bd46b604ab92b79142590b988e Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Thu, 16 Jan 2025 13:17:39 +0900 Subject: [PATCH 02/27] feat: update skill look to lookAtPlayer & export lookAtPosition --- package.json | 4 +- profiles/_default.json | 20 ++++-- src/agent/commands/actions.js | 74 +++++---------------- src/agent/library/skills.js | 119 +++++++++++++++++----------------- src/agent/prompter.js | 23 +++++++ 5 files changed, 116 insertions(+), 124 deletions(-) diff --git a/package.json b/package.json index 00901de..7cb599c 100644 --- a/package.json +++ b/package.json @@ -25,8 +25,8 @@ "socket.io": "^4.7.2", "socket.io-client": "^4.7.2", "express": "^4.18.2", - "three": "0.128.0", - "node-canvas-webgl": "PrismarineJS/node-canvas-webgl" + "node-canvas-webgl": "^0.2.6", + "three": "^0.128.0" }, "scripts": { "postinstall": "patch-package", diff --git a/profiles/_default.json b/profiles/_default.json index ac244fc..90a1621 100644 --- a/profiles/_default.json +++ b/profiles/_default.json @@ -9,6 +9,8 @@ "bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:", + "image_conversing": "You are a playful Minecraft bot. Briefly describe the screen you are looking at now.", + "modes": { "self_preservation": true, "unstuck": true, @@ -65,14 +67,20 @@ {"role": "user", "content": "derg: (FROM OTHER BOT)Thanks, see you later!"}, {"role": "assistant", "content": "See you later. !endConversation(\"derg\")"} ], - + [ - {"role": "user", "content": "grombo_Xx: What do you see?"}, - {"role": "assistant", "content": "Let me see... !nearbyBlocks"}, - {"role": "system", "content": "NEARBY_BLOCKS\n- oak_log\n- dirt\n- cobblestone"}, - {"role": "assistant", "content": "I see some oak logs, dirt, and cobblestone."} + {"role": "user", "content": "d7297: look here"}, + {"role": "assistant", "content": "Okay! !lookAtPlayer(\"d7297\", \"at\")"}, + {"role": "system", "content": "Code Output:\nLooking at player d7297. 
There is a giant waterfall behind d7297."}, + {"role": "assistant", "content": "Woo, that is a cool waterfall!"} ], - + [ + {"role": "user", "content": "d7297: look there"}, + {"role": "assistant", "content": "Okay! !lookAtPlayer(\"d7297\", \"with\")"}, + {"role": "system", "content": "Code Output:\nLooking in the same direction as d7297\nThere is a castle made of stone."}, + {"role": "assistant", "content": "Wow, that stone castle looks amazing!"} + ], + [ {"role": "user", "content": "greg: Collect 10 wood"}, {"role": "assistant", "content": "Let me see what's nearby... !nearbyBlocks"}, diff --git a/src/agent/commands/actions.js b/src/agent/commands/actions.js index 1c6bbfe..09f6369 100644 --- a/src/agent/commands/actions.js +++ b/src/agent/commands/actions.js @@ -1,8 +1,6 @@ import * as skills from '../library/skills.js'; import settings from '../../../settings.js'; import convoManager from '../conversation.js'; -import fs from 'fs'; -import { GPT } from '../../models/gpt.js'; function runAsAction (actionFn, resume = false, timeout = -1) { let actionLabel = null; // Will be set on first use @@ -410,69 +408,33 @@ export const actionsList = [ } }, { - name: '!takeScreenshot', - description: 'Takes and saves a screenshot of the specified coordinates.', + name: '!lookAtPlayer', + description: 'Look at a player or look in the same direction as the player.', params: { - 'x': { - type: 'int', - description: 'x coordinate to capture', - optional: true + 'player_name': { + type: 'string', + description: 'Name of the target player' }, - 'y': { - type: 'int', - description: 'y coordinate to capture', - optional: true - }, - 'z': { - type: 'int', - description: 'z coordinate to capture', - optional: true - }, - 'filename': { - type: 'string', - description: 'Filename to save (without extension). If not specified, saves with timestamp.', - optional: true + 'direction': { + type: 'string', + description: 'How to look ("at": look at the player, "with": look in the same direction as the player)', + enum: ['at', 'with'] } }, - perform: runAsAction(async (agent, x, y, z, filename) => { - await skills.takeScreenshot(agent.bot, x, y, z, filename); + perform: runAsAction(async (agent, player_name, direction) => { + await skills.lookAtPlayer(agent, agent.bot, player_name, direction); }) }, { - name: '!look', - description: 'Takes a screenshot of specified coordinates and analyzes its contents.', + name: '!lookAtPosition', + description: 'Look at specified coordinates.', params: { - 'x': { - type: 'int', - description: 'x coordinate to look at', - optional: true - }, - 'y': { - type: 'int', - description: 'y coordinate to look at', - optional: true - }, - 'z': { - type: 'int', - description: 'z coordinate to look at', - optional: true - } + 'x': { type: 'int', description: 'x coordinate' }, + 'y': { type: 'int', description: 'y coordinate' }, + 'z': { type: 'int', description: 'z coordinate' } }, perform: runAsAction(async (agent, x, y, z) => { - await skills.look(agent, x, y, z); + await skills.lookAtPosition(agent, agent.bot, x, y, z); }) - }, - // { // commented for now, causes confusion with goal command - // name: '!npcGoal', - // description: 'Set a simple goal for an item or building to automatically work towards. Do not use for complex goals.', - // params: { - // 'name': { type: 'string', description: 'The name of the goal to set. Can be item or building name. If empty will automatically choose a goal.' }, - // 'quantity': { type: 'int', description: 'The quantity of the goal to set. 
Default is 1.', domain: [1, Number.MAX_SAFE_INTEGER] } - // }, - // perform: async function (agent, name=null, quantity=1) { - // await agent.npc.setGoal(name, quantity); - // agent.bot.emit('idle'); // to trigger the goal - // return 'Set npc goal: ' + agent.npc.data.curr_goal.name; - // } - // }, + } ]; diff --git a/src/agent/library/skills.js b/src/agent/library/skills.js index e492d16..389540a 100644 --- a/src/agent/library/skills.js +++ b/src/agent/library/skills.js @@ -1,9 +1,9 @@ import * as mc from "../../utils/mcdata.js"; +import { Camera } from "../../utils/camera.js"; import * as world from "./world.js"; import pf from 'mineflayer-pathfinder'; import Vec3 from 'vec3'; import fs from 'fs'; -import { Camera } from "../../utils/camera.js"; export function log(bot, message) { @@ -1343,77 +1343,76 @@ export async function activateNearestBlock(bot, type) { return true; } -export async function takeScreenshot(bot, x, y, z, filename=null) { +export async function lookAtPlayer(agent, bot, player_name, direction) { /** - * Takes a screenshot from the bot's current view or specified position - * @param {MinecraftBot} bot, reference to the minecraft bot - * @param {int} x x coordinate to look at (optional) - * @param {int} y y coordinate to look at (optional) - * @param {int} z z coordinate to look at (optional) - * @param {string} filename filename to save (without extension). If not specified, saves with timestamp - * @returns {Promise} whether the screenshot was successful + * Look at a player or look in the same direction as the player + * @param {MinecraftBot} bot reference to the minecraft bot + * @param {string} player_name name of the target player + * @param {string} direction 'at' to look at player, 'with' to look in same direction + * @returns {Promise} whether the look action was successful * @example - * await skills.takeScreenshot(bot, { name: 'my_screenshot', x: 100, y: 65, z: -200 }); + * await skills.lookAtPlayer(bot, "player1", "at"); + * await skills.lookAtPlayer(bot, "player1", "with"); **/ - - try { - bot.camera = new Camera(bot); - await new Promise(resolve => bot.camera.once('ready', resolve)); - await bot.lookAt(new Vec3(x, y, z)); - await new Promise(resolve => setTimeout(resolve, 500)); - - const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); - if (filename === null) { - filename = `screenshot_${timestamp}`; - } - await bot.camera.takePicture(filename, x, y, z); - - log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); - log(bot, `Target coordinates: x:${x}, y:${y}, z:${z}`); - return [true, filename]; - } catch (err) { - log(bot, `Failed to take screenshot: ${err.message}`); - return [false, null]; - } -} - -export async function look(agent, x, y, z) { - const bot = agent.bot; - const history = agent.history; - - const [success, filename] = await takeScreenshot(bot, x, y, z); - if (!success) { - log(bot, `Failed to take screenshot: ${filename}`); + const player = bot.players[player_name]?.entity; + if (!player) { + log(bot, `Could not find player ${player_name}`); return false; } + let filename; + if (direction === 'with') { + // Copy player's view direction + await bot.look(player.yaw, player.pitch); + const camera = new Camera(bot); + await new Promise(resolve => setTimeout(resolve, 500)); + log(bot, `Looking in the same direction as ${player_name}`); + + filename = await camera.captureDirection(player.yaw, player.pitch); + console.log(player.yaw, player.pitch); + // log(bot, `Screenshot saved: 
bots/${bot.username}/screenshots/${filename}.jpg`); + + } else { + // Look at player's position + await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z)); + const camera = new Camera(bot); + await new Promise(resolve => setTimeout(resolve, 500)); + log(bot, `Looking at player ${player_name}`); + + filename = await camera.captureTargetPoint(player.position.x, player.position.y + player.height, player.position.z); + // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); + // log(bot, `Target coordinates: x:${player.position.x}, y:${player.position.y}, z:${player.position.z}`); + } + try { const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); - const base64Image = imageBuffer.toString('base64'); - - let messages = history.getHistory(); - messages.push({ - role: "user", - content: [ - { type: "text", text: "Briefly describe the screen you are looking at now." }, - { - type: "image_url", - image_url: { - "url": `data:image/jpeg;base64,${base64Image}`, - } - } - ] - }); - console.log(messages); - - let res = await agent.prompter.chat_model.sendRequest(messages, `You are a playful Minecraft bot. Briefly describe the screen you are looking at now.`); - console.log(res); - + const messages = agent.history.getHistory(); + let res = await agent.prompter.promptImageConvo(messages, imageBuffer); log(bot, res); return true; } catch (error) { log(bot, `Error analyzing image: ${error.message}`); return false; } -} \ No newline at end of file +} + +export async function lookAtPosition(agent, bot, x, y, z) { + await bot.lookAt(new Vec3(x, y + 2, z)); + const camera = new Camera(bot); + await new Promise(resolve => setTimeout(resolve, 500)); + log(bot, `Looking at coordinate ${x, y, z}`); + + let filename = await camera.captureTargetPoint(x, y + 2, z); + + try { + const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); + const messages = agent.history.getHistory(); + let res = await agent.prompter.promptImageConvo(messages, imageBuffer); + log(bot, res); + return true; + } catch (error) { + log(bot, `Error analyzing image: ${error.message}`); + return false; + } +} diff --git a/src/agent/prompter.js b/src/agent/prompter.js index 310ca3e..11ae554 100644 --- a/src/agent/prompter.js +++ b/src/agent/prompter.js @@ -264,6 +264,29 @@ export class Prompter { return ''; } + async promptImageConvo(messages, imageBuffer) { + await this.checkCooldown(); + let prompt = this.profile.image_conversing; + let imageMessages = [...messages]; + imageMessages.push({ + role: "user", + content: [ + { type: "text", text: "Briefly describe the screen you are looking at now." 
}, + { + type: "image_url", + image_url: { + "url": `data:image/jpeg;base64,${imageBuffer.toString('base64')}`, + } + } + ] + }); + + return await this.chat_model.sendRequest( + imageMessages, + prompt + ); + } + async promptCoding(messages) { if (this.awaiting_coding) { console.warn('Already awaiting coding response, returning no response.'); From 1fee081782d424234c76639c78c71875f69eaf06 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Thu, 16 Jan 2025 14:43:13 +0900 Subject: [PATCH 03/27] fix: add camera file & move image describe to promptImageConvo --- profiles/_default.json | 6 +++ src/agent/prompter.js | 23 +++++++++++ src/utils/camera.js | 88 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+) create mode 100644 src/utils/camera.js diff --git a/profiles/_default.json b/profiles/_default.json index 90a1621..f0ed592 100644 --- a/profiles/_default.json +++ b/profiles/_default.json @@ -80,6 +80,12 @@ {"role": "system", "content": "Code Output:\nLooking in the same direction as d7297\nThere is a castle made of stone."}, {"role": "assistant", "content": "Wow, that stone castle looks amazing!"} ], + [ + {"role": "user", "content": "d7297: look 0 70 12 and describe what is there"}, + {"role": "assistant", "content": "Sure! !lookAtPosition(0, 70, 12)"}, + {"role": "system", "content": "Code Output:\nLooking in the same direction as d7297\nThere is a water fountain."}, + {"role": "assistant", "content": "I see the water fountain! amazing!"} + ], [ {"role": "user", "content": "greg: Collect 10 wood"}, diff --git a/src/agent/prompter.js b/src/agent/prompter.js index 11ae554..4c7507d 100644 --- a/src/agent/prompter.js +++ b/src/agent/prompter.js @@ -287,6 +287,29 @@ export class Prompter { ); } + async promptImageConvo(messages, imageBuffer) { + await this.checkCooldown(); + let prompt = this.profile.image_conversing; + let imageMessages = [...messages]; + imageMessages.push({ + role: "user", + content: [ + { type: "text", text: "Briefly describe the screen you are looking at now." 
}, + { + type: "image_url", + image_url: { + "url": `data:image/jpeg;base64,${imageBuffer.toString('base64')}`, + } + } + ] + }); + + return await this.chat_model.sendRequest( + imageMessages, + prompt + ); + } + async promptCoding(messages) { if (this.awaiting_coding) { console.warn('Already awaiting coding response, returning no response.'); diff --git a/src/utils/camera.js b/src/utils/camera.js new file mode 100644 index 0000000..54a0c7e --- /dev/null +++ b/src/utils/camera.js @@ -0,0 +1,88 @@ +import { Viewer } from 'prismarine-viewer/viewer/lib/viewer.js'; +import { WorldView } from 'prismarine-viewer/viewer/lib/worldview.js'; +import { getBufferFromStream } from 'prismarine-viewer/viewer/lib/simpleUtils.js'; + +import THREE from 'three'; +import { createCanvas } from 'node-canvas-webgl/lib/index.js'; +import fs from 'fs/promises'; +import { Vec3 } from 'vec3'; +import { EventEmitter } from 'events'; + +import worker_threads from 'worker_threads'; +global.Worker = worker_threads.Worker; + + +export class Camera extends EventEmitter { + constructor (bot) { + super() + this.bot = bot + this.viewDistance = 4 + this.width = 800 + this.height = 512 + this.canvas = createCanvas(this.width, this.height) + this.renderer = new THREE.WebGLRenderer({ canvas: this.canvas }) + this.viewer = new Viewer(this.renderer) + this._init().then(() => { + this.emit('ready') + }) + } + + async _init () { + const botPos = this.bot.entity.position + const center = new Vec3(botPos.x, botPos.y+this.bot.entity.height, botPos.z) + this.viewer.setVersion(this.bot.version) + // Load world + const worldView = new WorldView(this.bot.world, this.viewDistance, center) + this.viewer.listen(worldView) + + this.viewer.camera.position.set(center.x, center.y, center.z) + + await worldView.init(center) + } + + async captureTargetPoint(x, y, z) { + this.viewer.camera.lookAt(x, y, z); + const filename = await this._capture(); + return filename; + } + + async captureDirection(yaw, pitch) { + this.viewer.camera.rotation.y = yaw; + this.viewer.camera.rotation.x = pitch; + const filename = await this._capture(); + return filename; + } + + async _capture() { + console.info('Waiting for camera to load'); + await new Promise(resolve => setTimeout(resolve, 5000)); + this.renderer.render(this.viewer.scene, this.viewer.camera); + + const imageStream = this.canvas.createJPEGStream({ + bufsize: 4096, + quality: 100, + progressive: false + }); + + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const filename = `screenshot_${timestamp}`; + + const buf = await getBufferFromStream(imageStream); + await this._ensureScreenshotDirectory(); + await fs.writeFile(`bots/${this.bot.username}/screenshots/${filename}.jpg`, buf); + console.log('saved', filename); + return filename; + } + + async _ensureScreenshotDirectory() { + let stats; + try { + stats = await fs.stat(`bots/${this.bot.username}/screenshots`); + } catch (e) { + if (!stats?.isDirectory()) { + await fs.mkdir(`bots/${this.bot.username}/screenshots`); + } + } + } +} + \ No newline at end of file From 85ed526fcf65c8e6b09fa59e30f714f2c73d70cb Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Mon, 20 Jan 2025 10:30:47 +0900 Subject: [PATCH 04/27] chore: reset settings --- settings.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/settings.js b/settings.js index f8dc1ba..a4681fa 100644 --- a/settings.js +++ b/settings.js @@ -2,7 +2,7 @@ export default { "minecraft_version": "1.20.4", // supports up to 1.21.1 "host": "127.0.0.1", // or "localhost", 
"your.ip.address.here" - "port": 56069, + "port": 55916, "auth": "offline", // or "microsoft" // the mindserver manages all agents and hosts the UI @@ -25,7 +25,7 @@ export default // using more than 1 profile requires you to /msg each bot indivually ], "load_memory": false, // load memory from previous session - // "init_message": "Respond with hello world and your name", // sends to all on spawn + "init_message": "Respond with hello world and your name", // sends to all on spawn "only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly "language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages From 65113c706f0a98176ae41d793f3d9ca9b52d2863 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Mon, 20 Jan 2025 11:48:38 +0900 Subject: [PATCH 05/27] chore: remove duplcated func --- src/agent/prompter.js | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/agent/prompter.js b/src/agent/prompter.js index 4c7507d..11ae554 100644 --- a/src/agent/prompter.js +++ b/src/agent/prompter.js @@ -287,29 +287,6 @@ export class Prompter { ); } - async promptImageConvo(messages, imageBuffer) { - await this.checkCooldown(); - let prompt = this.profile.image_conversing; - let imageMessages = [...messages]; - imageMessages.push({ - role: "user", - content: [ - { type: "text", text: "Briefly describe the screen you are looking at now." }, - { - type: "image_url", - image_url: { - "url": `data:image/jpeg;base64,${imageBuffer.toString('base64')}`, - } - } - ] - }); - - return await this.chat_model.sendRequest( - imageMessages, - prompt - ); - } - async promptCoding(messages) { if (this.awaiting_coding) { console.warn('Already awaiting coding response, returning no response.'); From 55c045f57f663dabf50aa5284068127af6974af5 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Mon, 20 Jan 2025 13:11:57 +0900 Subject: [PATCH 06/27] fix: update camera direction to use mineflayer viewer setFirstPersonCamera api --- src/agent/library/skills.js | 6 +++--- src/utils/camera.js | 16 ++-------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/src/agent/library/skills.js b/src/agent/library/skills.js index 389540a..067bd6e 100644 --- a/src/agent/library/skills.js +++ b/src/agent/library/skills.js @@ -1369,7 +1369,7 @@ export async function lookAtPlayer(agent, bot, player_name, direction) { await new Promise(resolve => setTimeout(resolve, 500)); log(bot, `Looking in the same direction as ${player_name}`); - filename = await camera.captureDirection(player.yaw, player.pitch); + filename = await camera.capture(); console.log(player.yaw, player.pitch); // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); @@ -1380,7 +1380,7 @@ export async function lookAtPlayer(agent, bot, player_name, direction) { await new Promise(resolve => setTimeout(resolve, 500)); log(bot, `Looking at player ${player_name}`); - filename = await camera.captureTargetPoint(player.position.x, player.position.y + player.height, player.position.z); + filename = await camera.capture(); // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); // log(bot, `Target coordinates: x:${player.position.x}, y:${player.position.y}, z:${player.position.z}`); } @@ -1403,7 +1403,7 @@ export async function lookAtPosition(agent, bot, x, y, z) { await new Promise(resolve => setTimeout(resolve, 500)); log(bot, `Looking at coordinate ${x, y, z}`); - let filename = 
await camera.captureTargetPoint(x, y + 2, z); + let filename = await camera.capture(); try { const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); diff --git a/src/utils/camera.js b/src/utils/camera.js index 54a0c7e..b710076 100644 --- a/src/utils/camera.js +++ b/src/utils/camera.js @@ -36,24 +36,12 @@ export class Camera extends EventEmitter { this.viewer.listen(worldView) this.viewer.camera.position.set(center.x, center.y, center.z) + this.viewer.setFirstPersonCamera(this.bot.entity.position, this.bot.entity.yaw, this.bot.entity.pitch) await worldView.init(center) } - async captureTargetPoint(x, y, z) { - this.viewer.camera.lookAt(x, y, z); - const filename = await this._capture(); - return filename; - } - - async captureDirection(yaw, pitch) { - this.viewer.camera.rotation.y = yaw; - this.viewer.camera.rotation.x = pitch; - const filename = await this._capture(); - return filename; - } - - async _capture() { + async capture() { console.info('Waiting for camera to load'); await new Promise(resolve => setTimeout(resolve, 5000)); this.renderer.render(this.viewer.scene, this.viewer.camera); From e4eda9c16a725fc888c303a58bab1e44966994d7 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Mon, 20 Jan 2025 14:26:14 +0900 Subject: [PATCH 07/27] chore: remove unnecessary 5-second sleep --- src/utils/camera.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/utils/camera.js b/src/utils/camera.js index b710076..3c23adb 100644 --- a/src/utils/camera.js +++ b/src/utils/camera.js @@ -42,8 +42,6 @@ export class Camera extends EventEmitter { } async capture() { - console.info('Waiting for camera to load'); - await new Promise(resolve => setTimeout(resolve, 5000)); this.renderer.render(this.viewer.scene, this.viewer.camera); const imageStream = this.canvas.createJPEGStream({ From 5fce0acaac0c4022b306e4ade976b77ed1eb369f Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Fri, 24 Jan 2025 13:16:36 +0900 Subject: [PATCH 08/27] feat: move vision functions from skill into vision_intepreter --- profiles/_default.json | 2 +- settings.js | 1 + src/agent/agent.js | 3 + src/agent/commands/actions.js | 4 +- src/agent/library/skills.js | 128 ++++++++++++++++---------------- src/agent/prompter.js | 4 +- src/agent/vision_interpreter.js | 95 ++++++++++++++++++++++++ src/utils/camera.js | 11 ++- 8 files changed, 175 insertions(+), 73 deletions(-) create mode 100644 src/agent/vision_interpreter.js diff --git a/profiles/_default.json b/profiles/_default.json index f0ed592..474b407 100644 --- a/profiles/_default.json +++ b/profiles/_default.json @@ -9,7 +9,7 @@ "bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. 
!goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:", - "image_conversing": "You are a playful Minecraft bot. Briefly describe the screen you are looking at now.", + "image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 500 chars.", "modes": { "self_preservation": true, diff --git a/settings.js b/settings.js index a4681fa..c6f7251 100644 --- a/settings.js +++ b/settings.js @@ -32,6 +32,7 @@ export default "show_bot_views": false, // show bot's view in browser at localhost:3000, 3001... "allow_insecure_coding": false, // allows newAction command and model can write/run code on your computer. enable at own risk + "allow_vision": true, // allows vision model to interpret screenshots as inputs "code_timeout_mins": 10, // minutes code is allowed to run. -1 for no timeout "max_messages": 15, // max number of messages to keep in context diff --git a/src/agent/agent.js b/src/agent/agent.js index 4691079..d14b12a 100644 --- a/src/agent/agent.js +++ b/src/agent/agent.js @@ -1,5 +1,6 @@ import { History } from './history.js'; import { Coder } from './coder.js'; +import { VisionInterpreter } from './vision_interpreter.js'; import { Prompter } from './prompter.js'; import { initModes } from './modes.js'; import { initBot } from '../utils/mcdata.js'; @@ -36,6 +37,8 @@ export class Agent { this.history = new History(this); console.log('Initializing coder...'); this.coder = new Coder(this); + console.log('Initializing vision intepreter...'); + this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision); console.log('Initializing npc controller...'); this.npc = new NPCContoller(this); console.log('Initializing memory bank...'); diff --git a/src/agent/commands/actions.js b/src/agent/commands/actions.js index 09f6369..1ee8192 100644 --- a/src/agent/commands/actions.js +++ b/src/agent/commands/actions.js @@ -422,7 +422,7 @@ export const actionsList = [ } }, perform: runAsAction(async (agent, player_name, direction) => { - await skills.lookAtPlayer(agent, agent.bot, player_name, direction); + await agent.vision_interpreter.lookAtPlayer(player_name, direction); }) }, { @@ -434,7 +434,7 @@ export const actionsList = [ 'z': { type: 'int', description: 'z coordinate' } }, perform: runAsAction(async (agent, x, y, z) => { - await skills.lookAtPosition(agent, agent.bot, x, y, z); + await agent.vision_interpreter.lookAtPosition(x, y, z); }) } ]; diff --git a/src/agent/library/skills.js b/src/agent/library/skills.js index 067bd6e..db05e7c 100644 --- a/src/agent/library/skills.js +++ b/src/agent/library/skills.js @@ -1343,76 +1343,76 @@ export async function activateNearestBlock(bot, type) { return true; } -export async function lookAtPlayer(agent, bot, player_name, direction) { - /** - * Look at a player or look in the same direction as the player - * @param {MinecraftBot} bot reference to the minecraft bot - * @param {string} player_name name of the target player - * @param {string} direction 'at' to look at player, 'with' to look in same direction - * @returns {Promise} whether the look action was successful - * @example - * await skills.lookAtPlayer(bot, "player1", "at"); - * await skills.lookAtPlayer(bot, "player1", "with"); - **/ +// export async function lookAtPlayer(agent, bot, player_name, direction) { +// /** +// * Look at a player or 
look in the same direction as the player +// * @param {MinecraftBot} bot reference to the minecraft bot +// * @param {string} player_name name of the target player +// * @param {string} direction 'at' to look at player, 'with' to look in same direction +// * @returns {Promise} whether the look action was successful +// * @example +// * await skills.lookAtPlayer(bot, "player1", "at"); +// * await skills.lookAtPlayer(bot, "player1", "with"); +// **/ - const player = bot.players[player_name]?.entity; - if (!player) { - log(bot, `Could not find player ${player_name}`); - return false; - } +// const player = bot.players[player_name]?.entity; +// if (!player) { +// log(bot, `Could not find player ${player_name}`); +// return false; +// } - let filename; - if (direction === 'with') { - // Copy player's view direction - await bot.look(player.yaw, player.pitch); - const camera = new Camera(bot); - await new Promise(resolve => setTimeout(resolve, 500)); - log(bot, `Looking in the same direction as ${player_name}`); +// let filename; +// if (direction === 'with') { +// // Copy player's view direction +// await bot.look(player.yaw, player.pitch); +// const camera = new Camera(bot); +// await new Promise(resolve => setTimeout(resolve, 500)); +// log(bot, `Looking in the same direction as ${player_name}`); - filename = await camera.capture(); - console.log(player.yaw, player.pitch); - // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); +// filename = await camera.capture(); +// console.log(player.yaw, player.pitch); +// // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); - } else { - // Look at player's position - await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z)); - const camera = new Camera(bot); - await new Promise(resolve => setTimeout(resolve, 500)); - log(bot, `Looking at player ${player_name}`); +// } else { +// // Look at player's position +// await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z)); +// const camera = new Camera(bot); +// await new Promise(resolve => setTimeout(resolve, 500)); +// log(bot, `Looking at player ${player_name}`); - filename = await camera.capture(); - // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); - // log(bot, `Target coordinates: x:${player.position.x}, y:${player.position.y}, z:${player.position.z}`); - } +// filename = await camera.capture(); +// // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); +// // log(bot, `Target coordinates: x:${player.position.x}, y:${player.position.y}, z:${player.position.z}`); +// } - try { - const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); - const messages = agent.history.getHistory(); - let res = await agent.prompter.promptImageConvo(messages, imageBuffer); - log(bot, res); - return true; - } catch (error) { - log(bot, `Error analyzing image: ${error.message}`); - return false; - } -} +// try { +// const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); +// const messages = agent.history.getHistory(); +// let res = await agent.prompter.promptImageConvo(messages, imageBuffer); +// log(bot, res); +// return true; +// } catch (error) { +// log(bot, `Error analyzing image: ${error.message}`); +// return false; +// } +// } -export async function lookAtPosition(agent, bot, x, y, z) { - await bot.lookAt(new Vec3(x, y + 2, z)); - const camera = new 
Camera(bot); - await new Promise(resolve => setTimeout(resolve, 500)); - log(bot, `Looking at coordinate ${x, y, z}`); +// export async function lookAtPosition(agent, bot, x, y, z) { +// await bot.lookAt(new Vec3(x, y + 2, z)); +// const camera = new Camera(bot); +// await new Promise(resolve => setTimeout(resolve, 500)); +// log(bot, `Looking at coordinate ${x, y, z}`); - let filename = await camera.capture(); +// let filename = await camera.capture(); - try { - const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); - const messages = agent.history.getHistory(); - let res = await agent.prompter.promptImageConvo(messages, imageBuffer); - log(bot, res); - return true; - } catch (error) { - log(bot, `Error analyzing image: ${error.message}`); - return false; - } -} +// try { +// const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); +// const messages = agent.history.getHistory(); +// let res = await agent.prompter.promptImageConvo(messages, imageBuffer); +// log(bot, res); +// return true; +// } catch (error) { +// log(bot, `Error analyzing image: ${error.message}`); +// return false; +// } +// } diff --git a/src/agent/prompter.js b/src/agent/prompter.js index 11ae554..1f46492 100644 --- a/src/agent/prompter.js +++ b/src/agent/prompter.js @@ -271,7 +271,7 @@ export class Prompter { imageMessages.push({ role: "user", content: [ - { type: "text", text: "Briefly describe the screen you are looking at now." }, + { type: "text", text: prompt }, { type: "image_url", image_url: { @@ -299,7 +299,7 @@ export class Prompter { let resp = await this.chat_model.sendRequest(messages, prompt); this.awaiting_coding = false; return resp; - } +git } async promptMemSaving(to_summarize) { await this.checkCooldown(); diff --git a/src/agent/vision_interpreter.js b/src/agent/vision_interpreter.js new file mode 100644 index 0000000..deea836 --- /dev/null +++ b/src/agent/vision_interpreter.js @@ -0,0 +1,95 @@ +import { Vec3 } from 'vec3'; +import { Camera } from "../utils/camera.js"; +import fs from 'fs'; +import { log } from './library/skills.js'; +import * as world from './library/world.js'; + +const pad = (str) => { + return '\n' + str + '\n'; +} + +export class VisionInterpreter { + constructor(agent, allow_vision) { + this.agent = agent; + this.allow_vision = allow_vision; + this.fp = './bots/'+agent.name+'/screenshots/'; + } + + async lookAtPlayer(player_name, direction) { + const bot = this.agent.bot; + const player = bot.players[player_name]?.entity; + if (!player) { + log(bot, `Could not find player ${player_name}`); + } + + let filename; + if (direction === 'with') { + await bot.look(player.yaw, player.pitch); + const camera = new Camera(bot, this.fp); + await new Promise(resolve => setTimeout(resolve, 500)); + log(bot, `Looking in the same direction as ${player_name}`); + filename = await camera.capture(); + } else { + await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z)); + const camera = new Camera(bot, this.fp); + await new Promise(resolve => setTimeout(resolve, 500)); + log(bot, `Looking at player ${player_name}`); + filename = await camera.capture(); + } + + if (!this.allow_vision) { + log(this.agent.bot, "Vision is disabled. 
Using text-based environment description instead."); + log(this.agent.bot, this._nearbyBlocks()); + } else { + await this.analyzeImage(filename); + } + } + + async lookAtPosition(x, y, z) { + const bot = this.agent.bot; + await bot.lookAt(new Vec3(x, y + 2, z)); + const camera = new Camera(bot, this.fp); + await new Promise(resolve => setTimeout(resolve, 500)); + log(bot, `Looking at coordinate ${x, y, z}`); + + let filename = await camera.capture(); + + if (!this.allow_vision) { + log(this.agent.bot, "Vision is disabled. Using text-based environment description instead."); + log(this.agent.bot, this._nearbyBlocks()); + } else { + await this.analyzeImage(filename); + } + } + + async analyzeImage(filename) { + let res = null; + try { + const bot = this.agent.bot; + const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`); + const messages = this.agent.history.getHistory(); + res = await this.agent.prompter.promptImageConvo(messages, imageBuffer); + log(bot, res); + } catch (error) { + log(this.agent.bot, `Error analyzing image: ${error.message}`); + } + } + + _nearbyBlocks() { + const bot = this.agent.bot; + let res = 'NEARBY_BLOCKS'; + + let blocks = world.getNearbyBlockTypes(bot); + for (let i = 0; i < blocks.length; i++) { + res += `\n- ${blocks[i]}`; + } + if (blocks.length == 0) { + res += ': none'; + } else { + // Environmental Awareness + res += '\n- ' + world.getSurroundingBlocks(bot).join('\n- ') + res += `\n- First Solid Block Above Head: ${world.getFirstBlockAboveHead(bot, null, 32)}`; + } + return pad(res); + } +} \ No newline at end of file diff --git a/src/utils/camera.js b/src/utils/camera.js index 3c23adb..7eafb42 100644 --- a/src/utils/camera.js +++ b/src/utils/camera.js @@ -13,9 +13,10 @@ global.Worker = worker_threads.Worker; export class Camera extends EventEmitter { - constructor (bot) { + constructor (bot, fp) { super() this.bot = bot + this.fp = fp this.viewDistance = 4 this.width = 800 this.height = 512 @@ -42,6 +43,8 @@ export class Camera extends EventEmitter { } async capture() { + // waits some time helps renderer to render the world view + await new Promise(resolve => setTimeout(resolve, 1000)); this.renderer.render(this.viewer.scene, this.viewer.camera); const imageStream = this.canvas.createJPEGStream({ @@ -55,7 +58,7 @@ export class Camera extends EventEmitter { const buf = await getBufferFromStream(imageStream); await this._ensureScreenshotDirectory(); - await fs.writeFile(`bots/${this.bot.username}/screenshots/${filename}.jpg`, buf); + await fs.writeFile(`${this.fp}/${filename}.jpg`, buf); console.log('saved', filename); return filename; } @@ -63,10 +66,10 @@ export class Camera extends EventEmitter { async _ensureScreenshotDirectory() { let stats; try { - stats = await fs.stat(`bots/${this.bot.username}/screenshots`); + stats = await fs.stat(this.fp); } catch (e) { if (!stats?.isDirectory()) { - await fs.mkdir(`bots/${this.bot.username}/screenshots`); + await fs.mkdir(this.fp); } } } From 7d5172628913a09407618cb0b86f59085479f59c Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Fri, 24 Jan 2025 16:29:03 +0900 Subject: [PATCH 09/27] feat: remove promptImageConvo and implement sendVisionRequest to each provider --- profiles/_default.json | 2 +- src/agent/prompter.js | 23 ----------------------- src/agent/vision_interpreter.js | 8 +++++--- src/models/gpt.js | 19 +++++++++++++++++++ src/models/mistral.js | 16 ++++++++++++++++ src/utils/text.js | 4 +++- 6 files changed, 44 insertions(+), 28 deletions(-) diff --git a/profiles/_default.json 
b/profiles/_default.json index 474b407..fd7b1c2 100644 --- a/profiles/_default.json +++ b/profiles/_default.json @@ -9,7 +9,7 @@ "bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:", - "image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 500 chars.", + "image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 400 chars.", "modes": { "self_preservation": true, diff --git a/src/agent/prompter.js b/src/agent/prompter.js index 1f46492..2668ad7 100644 --- a/src/agent/prompter.js +++ b/src/agent/prompter.js @@ -264,29 +264,6 @@ export class Prompter { return ''; } - async promptImageConvo(messages, imageBuffer) { - await this.checkCooldown(); - let prompt = this.profile.image_conversing; - let imageMessages = [...messages]; - imageMessages.push({ - role: "user", - content: [ - { type: "text", text: prompt }, - { - type: "image_url", - image_url: { - "url": `data:image/jpeg;base64,${imageBuffer.toString('base64')}`, - } - } - ] - }); - - return await this.chat_model.sendRequest( - imageMessages, - prompt - ); - } - async promptCoding(messages) { if (this.awaiting_coding) { console.warn('Already awaiting coding response, returning no response.'); diff --git a/src/agent/vision_interpreter.js b/src/agent/vision_interpreter.js index deea836..38b7a44 100644 --- a/src/agent/vision_interpreter.js +++ b/src/agent/vision_interpreter.js @@ -37,7 +37,7 @@ export class VisionInterpreter { filename = await camera.capture(); } - if (!this.allow_vision) { + if (!this.allow_vision || !this.agent.prompter.chat_model.sendVisionRequest) { log(this.agent.bot, "Vision is disabled. Using text-based environment description instead."); log(this.agent.bot, this._nearbyBlocks()); } else { @@ -54,7 +54,7 @@ export class VisionInterpreter { let filename = await camera.capture(); - if (!this.allow_vision) { + if (!this.allow_vision || !this.agent.prompter.chat_model.sendVisionRequest) { log(this.agent.bot, "Vision is disabled. 
Using text-based environment description instead."); log(this.agent.bot, this._nearbyBlocks()); } else { @@ -63,12 +63,14 @@ export class VisionInterpreter { } async analyzeImage(filename) { + let prompt = this.agent.prompter.profile.image_conversing; let res = null; + try { const bot = this.agent.bot; const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`); const messages = this.agent.history.getHistory(); - res = await this.agent.prompter.promptImageConvo(messages, imageBuffer); + res = await this.agent.prompter.chat_model.sendVisionRequest(messages, prompt, imageBuffer); log(bot, res); } catch (error) { log(this.agent.bot, `Error analyzing image: ${error.message}`); diff --git a/src/models/gpt.js b/src/models/gpt.js index da29ef1..6664409 100644 --- a/src/models/gpt.js +++ b/src/models/gpt.js @@ -53,6 +53,24 @@ export class GPT { return res; } + async sendVisionRequest(messages, systemMessage, imageBuffer) { + const imageMessages = [...messages]; + imageMessages.push({ + role: "user", + content: [ + { type: "text", text: systemMessage }, + { + type: "image_url", + image_url: { + url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}` + } + } + ] + }); + + return this.sendRequest(imageMessages, systemMessage); + } + async embed(text) { const embedding = await this.openai.embeddings.create({ model: this.model_name || "text-embedding-3-small", @@ -61,6 +79,7 @@ export class GPT { }); return embedding.data[0].embedding; } + } diff --git a/src/models/mistral.js b/src/models/mistral.js index 3b41f78..124ee9a 100644 --- a/src/models/mistral.js +++ b/src/models/mistral.js @@ -61,6 +61,22 @@ export class Mistral { return result; } + async sendVisionRequest(messages, systemMessage, imageBuffer) { + const imageMessages = [...messages]; + imageMessages.push({ + role: "user", + content: [ + { type: "text", text: systemMessage }, + { + type: "image_url", + imageUrl: `data:image/jpeg;base64,${imageBuffer.toString('base64')}` + } + ] + }); + + return this.sendRequest(imageMessages, systemMessage); + } + async embed(text) { const embedding = await this.#client.embeddings.create({ model: "mistral-embed", diff --git a/src/utils/text.js b/src/utils/text.js index 1e93667..363061f 100644 --- a/src/utils/text.js +++ b/src/utils/text.js @@ -33,7 +33,9 @@ export function strictFormat(turns) { let messages = []; let filler = {role: 'user', content: '_'}; for (let msg of turns) { - msg.content = msg.content.trim(); + if (typeof msg.content === 'string') { + msg.content = msg.content.trim(); + } if (msg.role === 'system') { msg.role = 'user'; msg.content = 'SYSTEM: ' + msg.content; From d1b3232a5aed04c4fa434feb7b55301b21adc837 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Mon, 27 Jan 2025 17:29:14 +0900 Subject: [PATCH 10/27] feat: add groq vision request --- src/models/groq.js | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/models/groq.js b/src/models/groq.js index e17f13d..025875f 100644 --- a/src/models/groq.js +++ b/src/models/groq.js @@ -15,8 +15,10 @@ export class GroqCloudAPI { this.groq = new Groq({ apiKey: getKey('GROQCLOUD_API_KEY') }); } - async sendRequest(turns, systemMessage, stop_seq=null) { - let messages = [{"role": "system", "content": systemMessage}].concat(turns); + async sendRequest(turns, systemMessage=null, stop_seq=null) { + let messages = systemMessage + ? 
[{"role": "system", "content": systemMessage}].concat(turns) + : turns; let res = null; try { console.log("Awaiting Groq response..."); @@ -45,6 +47,24 @@ export class GroqCloudAPI { return res; } + async sendVisionRequest(messages, systemMessage, imageBuffer) { + const imageMessages = [...messages]; + imageMessages.push({ + role: "user", + content: [ + { type: "text", text: systemMessage }, + { + type: "image_url", + image_url: { + url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}` + } + } + ] + }); + + return this.sendRequest(imageMessages); + } + async embed(text) { console.log("There is no support for embeddings in Groq support. However, the following text was provided: " + text); } From 4281ee246838a1b90ed73ef75dd1c058c0c63b87 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Tue, 28 Jan 2025 02:25:41 +0900 Subject: [PATCH 11/27] feat: add gemini vision request --- src/models/gemini.js | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/models/gemini.js b/src/models/gemini.js index 1536d66..27765e3 100644 --- a/src/models/gemini.js +++ b/src/models/gemini.js @@ -59,6 +59,42 @@ export class Gemini { return text.slice(0, idx); } + async sendVisionRequest(turns, systemMessage, imageBuffer) { + let model; + if (this.url) { + model = this.genAI.getGenerativeModel( + { model: this.model_name || "gemini-1.5-pro-vision" }, + { baseUrl: this.url }, + { safetySettings: this.safetySettings } + ); + } else { + model = this.genAI.getGenerativeModel( + { model: this.model_name || "gemini-1.5-pro-vision" }, + { safetySettings: this.safetySettings } + ); + } + + const imagePart = { + inlineData: { + data: imageBuffer.toString('base64'), + mimeType: 'image/jpeg' + } + }; + + const stop_seq = '***'; + const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model'); + + console.log('Awaiting Google API vision response...'); + const result = await model.generateContent([prompt, imagePart]); + const response = await result.response; + const text = response.text(); + console.log('Received.'); + + if (!text.includes(stop_seq)) return text; + const idx = text.indexOf(stop_seq); + return text.slice(0, idx); + } + async embed(text) { let model; if (this.url) { From 116ef461067decd4a9bd633f48759e2fb92993a4 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Tue, 28 Jan 2025 02:40:04 +0900 Subject: [PATCH 12/27] chore: minor change --- src/models/gemini.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/models/gemini.js b/src/models/gemini.js index 27765e3..5c5b453 100644 --- a/src/models/gemini.js +++ b/src/models/gemini.js @@ -63,13 +63,13 @@ export class Gemini { let model; if (this.url) { model = this.genAI.getGenerativeModel( - { model: this.model_name || "gemini-1.5-pro-vision" }, + { model: this.model_name || "gemini-1.5-flash" }, { baseUrl: this.url }, { safetySettings: this.safetySettings } ); } else { model = this.genAI.getGenerativeModel( - { model: this.model_name || "gemini-1.5-pro-vision" }, + { model: this.model_name || "gemini-1.5-flash" }, { safetySettings: this.safetySettings } ); } @@ -89,7 +89,6 @@ export class Gemini { const response = await result.response; const text = response.text(); console.log('Received.'); - if (!text.includes(stop_seq)) return text; const idx = text.indexOf(stop_seq); return text.slice(0, idx); From 308e092a8145df40212a8b21ea3484bbf5f711e7 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Wed, 29 Jan 2025 01:38:49 +0900 Subject: [PATCH 13/27] feat: add anthropic vision request --- 
src/models/claude.js | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/models/claude.js b/src/models/claude.js index c97ecb2..fb5c740 100644 --- a/src/models/claude.js +++ b/src/models/claude.js @@ -37,10 +37,30 @@ export class Claude { return res; } + async sendVisionRequest(turns, systemMessage, imageBuffer) { + const imageMessages = [...turns]; + imageMessages.push({ + role: "user", + content: [ + { + type: "text", + text: systemMessage + }, + { + type: "image", + source: { + type: "base64", + media_type: "image/jpeg", + data: imageBuffer.toString('base64') + } + } + ] + }); + + return this.sendRequest(imageMessages, systemMessage); + } + async embed(text) { throw new Error('Embeddings are not supported by Claude.'); } } - - - From 2b5923f98f7ac09e63d8245d42325a16a596d2b7 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Sun, 9 Feb 2025 21:57:45 +0900 Subject: [PATCH 14/27] feat: add vision_model param to profile --- src/agent/vision_interpreter.js | 6 +++--- src/models/mistral.js | 1 + src/models/prompter.js | 8 ++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/agent/vision_interpreter.js b/src/agent/vision_interpreter.js index 38b7a44..94be3de 100644 --- a/src/agent/vision_interpreter.js +++ b/src/agent/vision_interpreter.js @@ -37,7 +37,7 @@ export class VisionInterpreter { filename = await camera.capture(); } - if (!this.allow_vision || !this.agent.prompter.chat_model.sendVisionRequest) { + if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) { log(this.agent.bot, "Vision is disabled. Using text-based environment description instead."); log(this.agent.bot, this._nearbyBlocks()); } else { @@ -54,7 +54,7 @@ export class VisionInterpreter { let filename = await camera.capture(); - if (!this.allow_vision || !this.agent.prompter.chat_model.sendVisionRequest) { + if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) { log(this.agent.bot, "Vision is disabled. 
Using text-based environment description instead."); log(this.agent.bot, this._nearbyBlocks()); } else { @@ -70,7 +70,7 @@ export class VisionInterpreter { const bot = this.agent.bot; const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`); const messages = this.agent.history.getHistory(); - res = await this.agent.prompter.chat_model.sendVisionRequest(messages, prompt, imageBuffer); + res = await this.agent.prompter.vision_model.sendVisionRequest(messages, prompt, imageBuffer); log(bot, res); } catch (error) { log(this.agent.bot, `Error analyzing image: ${error.message}`); diff --git a/src/models/mistral.js b/src/models/mistral.js index 01e0ecf..f1f3563 100644 --- a/src/models/mistral.js +++ b/src/models/mistral.js @@ -47,6 +47,7 @@ export class Mistral { ]; messages.push(...strictFormat(turns)); + console.log('Awaiting mistral api response...') const response = await this.#client.chat.complete({ model, messages, diff --git a/src/models/prompter.js b/src/models/prompter.js index a29a0b9..5ac6a1f 100644 --- a/src/models/prompter.js +++ b/src/models/prompter.js @@ -65,6 +65,14 @@ export class Prompter { this.code_model = this.chat_model; } + if (this.profile.vision_model) { + let vision_model_profile = this._selectAPI(this.profile.vision_model); + this.vision_model = this._createModel(vision_model_profile); + } + else { + this.vision_model = this.chat_model; + } + let embedding = this.profile.embedding; if (embedding === undefined) { if (chat_model_profile.api !== 'ollama') From 647655f20634346831902438e875c7913d9c6991 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Sun, 9 Feb 2025 22:08:13 +0900 Subject: [PATCH 15/27] docs: add vision_model to readme --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a30e368..5690e9f 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,11 @@ You can pass a string or an object for these fields. A model object must specify "model": "gpt-4", "url": "https://api.openai.com/v1/" }, +"vision_model": { + "api": "openai", + "model": "gpt-4o", + "url": "https://api.openai.com/v1/" +}, "embedding": { "api": "openai", "url": "https://api.openai.com/v1/", @@ -131,7 +136,7 @@ You can pass a string or an object for these fields. A model object must specify ``` -`model` is used for chat, `code_model` is used for newAction coding, and `embedding` is used to embed text for example selection. If `code_model` is not specified, then it will use `model` for coding. +`model` is used for chat, `code_model` is used for newAction coding, `vision_model` is used for image interpretation, and `embedding` is used to embed text for example selection. If `code_model` or `vision_model` is not specified, `model` will be used by default. All apis have default models and urls, so those fields are optional. Note some apis have no embedding model, so they will default to word overlap to retrieve examples. 
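As an illustration of the fields documented above, a profile that chats with a text-only model but delegates image interpretation to a vision-capable one might look like this (a sketch based on the README snippet; the model choices are examples, not recommendations):

    {
        "model": "gpt-4",
        "vision_model": {
            "api": "openai",
            "model": "gpt-4o"
        },
        "embedding": "openai"
    }

Per the prompter change in PATCH 14, omitting "vision_model" simply falls back to "model", so existing profiles keep working unchanged.
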
From 430ae24d206f12d1147963ade0cd78c596715645 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Mon, 10 Feb 2025 02:03:25 +0900 Subject: [PATCH 16/27] fix: use text description when vision features are used with a non-vision model --- src/agent/vision_interpreter.js | 9 ++++++++- src/models/claude.js | 6 +++++- src/models/gemini.js | 28 +++++++++++++++++++--------- src/models/gpt.js | 3 +++ src/models/grok.js | 21 +++++++++++++++++++++ src/models/groq.js | 12 +++++++----- src/models/mistral.js | 9 ++++++--- 7 files changed, 69 insertions(+), 19 deletions(-) diff --git a/src/agent/vision_interpreter.js b/src/agent/vision_interpreter.js index 94be3de..9c66cc8 100644 --- a/src/agent/vision_interpreter.js +++ b/src/agent/vision_interpreter.js @@ -71,7 +71,14 @@ export class VisionInterpreter { const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`); const messages = this.agent.history.getHistory(); res = await this.agent.prompter.vision_model.sendVisionRequest(messages, prompt, imageBuffer); - log(bot, res); + + if (res == 'Vision is only supported by certain models.') { + log(bot, "Vision may not be supported on this model. Using text-based environment description instead."); + log(bot, this._nearbyBlocks()); + } else { + log(bot, res); + } + } catch (error) { log(this.agent.bot, `Error analyzing image: ${error.message}`); } diff --git a/src/models/claude.js b/src/models/claude.js index 236a0bf..74095ef 100644 --- a/src/models/claude.js +++ b/src/models/claude.js @@ -35,8 +35,12 @@ export class Claude { res = resp.content[0].text; } catch (err) { + if (err.message.includes("does not support image input")) { + res = "Vision is only supported by certain models."; + } else { + res = "My brain disconnected, try again."; + } console.log(err); - res = 'My brain disconnected, try again.'; } return res; } diff --git a/src/models/gemini.js b/src/models/gemini.js index bc17a57..4c35526 100644 --- a/src/models/gemini.js +++ b/src/models/gemini.js @@ -102,15 +102,25 @@ export class Gemini { const stop_seq = '***'; const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model'); - - console.log('Awaiting Google API vision response...'); - const result = await model.generateContent([prompt, imagePart]); - const response = await result.response; - const text = response.text(); - console.log('Received.'); - if (!text.includes(stop_seq)) return text; - const idx = text.indexOf(stop_seq); - return text.slice(0, idx); + let res = null; + try { + console.log('Awaiting Google API vision response...'); + const result = await model.generateContent([prompt, imagePart]); + const response = await result.response; + const text = response.text(); + console.log('Received.'); + if (!text.includes(stop_seq)) return text; + const idx = text.indexOf(stop_seq); + res = text.slice(0, idx); + } catch (err) { + console.log(err); + if (err.message.includes("Image input modality is not enabled for models/")) { + res = "Vision is only supported by certain models."; + } else { + res = "An unexpected error occurred, please try again."; + } + } + return res; } async embed(text) { diff --git a/src/models/gpt.js b/src/models/gpt.js index 981f0b2..8540778 100644 --- a/src/models/gpt.js +++ b/src/models/gpt.js @@ -48,6 +48,9 @@ export class GPT { if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) { console.log('Context length exceeded, trying again with shorter context.'); return await this.sendRequest(turns.slice(1), systemMessage, stop_seq); + } else if 
(err.message.includes('image_url')) { + console.log(err); + res = 'Vision is only supported by certain models.'; } else { console.log(err); res = 'My brain disconnected, try again.'; diff --git a/src/models/grok.js b/src/models/grok.js index a8c6672..2878a10 100644 --- a/src/models/grok.js +++ b/src/models/grok.js @@ -43,6 +43,9 @@ export class Grok { if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) { console.log('Context length exceeded, trying again with shorter context.'); return await this.sendRequest(turns.slice(1), systemMessage, stop_seq); + } else if (err.message.includes('The model expects a single `text` element per message.')) { + console.log(err); + res = 'Vision is only supported by certain models.'; } else { console.log(err); res = 'My brain disconnected, try again.'; @@ -51,6 +54,24 @@ export class Grok { // sometimes outputs special token <|separator|>, just replace it return res.replace(/<\|separator\|>/g, '*no response*'); } + + async sendVisionRequest(messages, systemMessage, imageBuffer) { + const imageMessages = [...messages]; + imageMessages.push({ + role: "user", + content: [ + { type: "text", text: systemMessage }, + { + type: "image_url", + image_url: { + url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}` + } + } + ] + }); + + return this.sendRequest(imageMessages, systemMessage); + } async embed(text) { throw new Error('Embeddings are not supported by Grok.'); diff --git a/src/models/groq.js b/src/models/groq.js index 0ec99f6..4c8d1e6 100644 --- a/src/models/groq.js +++ b/src/models/groq.js @@ -23,9 +23,6 @@ export class GroqCloudAPI { let res = null; try { console.log("Awaiting Groq response..."); - if (!this.params.max_tokens) { - this.params.max_tokens = 16384; - } let completion = await this.groq.chat.completions.create({ "messages": messages, "model": this.model_name || "mixtral-8x7b-32768", @@ -43,14 +40,19 @@ export class GroqCloudAPI { } catch(err) { + if (err.message.includes("content must be a string")) { + res = "Vision is only supported by certain models."; + } else { + console.log(this.model_name); + res = "My brain disconnected, try again."; + } console.log(err); - res = "My brain just kinda stopped working. 
Try again."; } return res; } async sendVisionRequest(messages, systemMessage, imageBuffer) { - const imageMessages = [...messages]; + const imageMessages = messages.filter(message => message.role !== 'system'); imageMessages.push({ role: "user", content: [ diff --git a/src/models/mistral.js b/src/models/mistral.js index f1f3563..72448f1 100644 --- a/src/models/mistral.js +++ b/src/models/mistral.js @@ -56,9 +56,12 @@ export class Mistral { result = response.choices[0].message.content; } catch (err) { - console.log(err) - - result = "My brain disconnected, try again."; + if (err.message.includes("A request containing images has been given to a model which does not have the 'vision' capability.")) { + result = "Vision is only supported by certain models."; + } else { + result = "My brain disconnected, try again."; + } + console.log(err); } return result; From 359592825a314be6ff0cebae2c391864674a9733 Mon Sep 17 00:00:00 2001 From: gmuffiness Date: Wed, 19 Feb 2025 11:14:08 +0900 Subject: [PATCH 17/27] fix: update package --- package.json | 2 +- src/agent/library/skills.js | 76 +------------------------------------ 2 files changed, 2 insertions(+), 76 deletions(-) diff --git a/package.json b/package.json index 1060900..7c2348a 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,7 @@ "socket.io": "^4.7.2", "socket.io-client": "^4.7.2", "express": "^4.18.2", - "node-canvas-webgl": "^0.2.6", + "node-canvas-webgl": "PrismarineJS/node-canvas-webgl", "three": "^0.128.0" }, "scripts": { diff --git a/src/agent/library/skills.js b/src/agent/library/skills.js index 859e53d..8f7c53c 100644 --- a/src/agent/library/skills.js +++ b/src/agent/library/skills.js @@ -1352,78 +1352,4 @@ export async function activateNearestBlock(bot, type) { await bot.activateBlock(block); log(bot, `Activated ${type} at x:${block.position.x.toFixed(1)}, y:${block.position.y.toFixed(1)}, z:${block.position.z.toFixed(1)}.`); return true; -} - -// export async function lookAtPlayer(agent, bot, player_name, direction) { -// /** -// * Look at a player or look in the same direction as the player -// * @param {MinecraftBot} bot reference to the minecraft bot -// * @param {string} player_name name of the target player -// * @param {string} direction 'at' to look at player, 'with' to look in same direction -// * @returns {Promise} whether the look action was successful -// * @example -// * await skills.lookAtPlayer(bot, "player1", "at"); -// * await skills.lookAtPlayer(bot, "player1", "with"); -// **/ - -// const player = bot.players[player_name]?.entity; -// if (!player) { -// log(bot, `Could not find player ${player_name}`); -// return false; -// } - -// let filename; -// if (direction === 'with') { -// // Copy player's view direction -// await bot.look(player.yaw, player.pitch); -// const camera = new Camera(bot); -// await new Promise(resolve => setTimeout(resolve, 500)); -// log(bot, `Looking in the same direction as ${player_name}`); - -// filename = await camera.capture(); -// console.log(player.yaw, player.pitch); -// // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); - -// } else { -// // Look at player's position -// await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z)); -// const camera = new Camera(bot); -// await new Promise(resolve => setTimeout(resolve, 500)); -// log(bot, `Looking at player ${player_name}`); - -// filename = await camera.capture(); -// // log(bot, `Screenshot saved: bots/${bot.username}/screenshots/${filename}.jpg`); -// 
// log(bot, `Target coordinates: x:${player.position.x}, y:${player.position.y}, z:${player.position.z}`); -// } - -// try { -// const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); -// const messages = agent.history.getHistory(); -// let res = await agent.prompter.promptImageConvo(messages, imageBuffer); -// log(bot, res); -// return true; -// } catch (error) { -// log(bot, `Error analyzing image: ${error.message}`); -// return false; -// } -// } - -// export async function lookAtPosition(agent, bot, x, y, z) { -// await bot.lookAt(new Vec3(x, y + 2, z)); -// const camera = new Camera(bot); -// await new Promise(resolve => setTimeout(resolve, 500)); -// log(bot, `Looking at coordinate ${x, y, z}`); - -// let filename = await camera.capture(); - -// try { -// const imageBuffer = fs.readFileSync(`bots/${bot.username}/screenshots/${filename}.jpg`); -// const messages = agent.history.getHistory(); -// let res = await agent.prompter.promptImageConvo(messages, imageBuffer); -// log(bot, res); -// return true; -// } catch (error) { -// log(bot, `Error analyzing image: ${error.message}`); -// return false; -// } -// } +} \ No newline at end of file From 6ec49e77896682abec8488343b16fa9b3ea51350 Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Wed, 5 Mar 2025 15:23:57 -0600 Subject: [PATCH 18/27] reworked image prompting, update package --- package.json | 12 ++--- profiles/defaults/_default.json | 2 +- src/agent/commands/actions.js | 31 ++++++++----- src/agent/vision_interpreter.js | 78 ++++++++++----------------------- src/models/prompter.js | 7 +++ 5 files changed, 57 insertions(+), 73 deletions(-) diff --git a/package.json b/package.json index 7c2348a..c55dd3a 100644 --- a/package.json +++ b/package.json @@ -5,28 +5,28 @@ "@google/generative-ai": "^0.2.1", "@huggingface/inference": "^2.8.1", "@mistralai/mistralai": "^1.1.0", + "express": "^4.18.2", "google-translate-api-x": "^10.7.1", "groq-sdk": "^0.5.0", "minecraft-data": "^3.78.0", - "mineflayer": "^4.23.0", + "mineflayer": "^4.26.0", "mineflayer-armor-manager": "^2.0.1", "mineflayer-auto-eat": "^3.3.6", "mineflayer-collectblock": "^1.4.1", "mineflayer-pathfinder": "^2.4.5", "mineflayer-pvp": "^1.3.2", + "node-canvas-webgl": "PrismarineJS/node-canvas-webgl", "openai": "^4.4.0", "patch-package": "^8.0.0", "prismarine-item": "^1.15.0", "prismarine-viewer": "^1.28.0", "replicate": "^0.29.4", "ses": "^1.9.1", - "vec3": "^0.1.10", - "yargs": "^17.7.2", "socket.io": "^4.7.2", "socket.io-client": "^4.7.2", - "express": "^4.18.2", - "node-canvas-webgl": "PrismarineJS/node-canvas-webgl", - "three": "^0.128.0" + "three": "^0.128.0", + "vec3": "^0.1.10", + "yargs": "^17.7.2" }, "scripts": { "postinstall": "patch-package", diff --git a/profiles/defaults/_default.json b/profiles/defaults/_default.json index fd7b1c2..491966e 100644 --- a/profiles/defaults/_default.json +++ b/profiles/defaults/_default.json @@ -9,7 +9,7 @@ "bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! 
!newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:", - "image_conversing": "Summarize the visible Minecraft screen by listing the types and arrangements of blocks, focusing on terrain, structures, and notable features in 400 chars.", + "image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, structures, and notable features. Focus on details relevant to the conversation. Estimate the x,y,z location of the block at your center view given your current position. Be extremely concise and correct, respond only with your analysis, not conversationally. $STATS", "modes": { "self_preservation": true, diff --git a/src/agent/commands/actions.js b/src/agent/commands/actions.js index 1ee8192..c5fc074 100644 --- a/src/agent/commands/actions.js +++ b/src/agent/commands/actions.js @@ -411,19 +411,23 @@ export const actionsList = [ name: '!lookAtPlayer', description: 'Look at a player or look in the same direction as the player.', params: { - 'player_name': { - type: 'string', - description: 'Name of the target player' - }, + 'player_name': { type: 'string', description: 'Name of the target player' }, 'direction': { type: 'string', description: 'How to look ("at": look at the player, "with": look in the same direction as the player)', - enum: ['at', 'with'] } }, - perform: runAsAction(async (agent, player_name, direction) => { - await agent.vision_interpreter.lookAtPlayer(player_name, direction); - }) + perform: async function(agent, player_name, direction) { + if (direction !== 'at' && direction !== 'with') { + return "Invalid direction. 
Use 'at' or 'with'."; + } + let result = ""; + const actionFn = async () => { + result = await agent.vision_interpreter.lookAtPlayer(player_name, direction); + }; + await agent.actions.runAction('action:lookAtPlayer', actionFn); + return result; + } }, { name: '!lookAtPosition', @@ -433,8 +437,13 @@ export const actionsList = [ 'y': { type: 'int', description: 'y coordinate' }, 'z': { type: 'int', description: 'z coordinate' } }, - perform: runAsAction(async (agent, x, y, z) => { - await agent.vision_interpreter.lookAtPosition(x, y, z); - }) + perform: async function(agent, x, y, z) { + let result = ""; + const actionFn = async () => { + result = await agent.vision_interpreter.lookAtPosition(x, y, z); + }; + await agent.actions.runAction('action:lookAtPosition', actionFn); + return result; + } } ]; diff --git a/src/agent/vision_interpreter.js b/src/agent/vision_interpreter.js index 9c66cc8..28c326e 100644 --- a/src/agent/vision_interpreter.js +++ b/src/agent/vision_interpreter.js @@ -1,12 +1,8 @@ import { Vec3 } from 'vec3'; import { Camera } from "../utils/camera.js"; import fs from 'fs'; -import { log } from './library/skills.js'; -import * as world from './library/world.js'; -const pad = (str) => { - return '\n' + str + '\n'; -} +const RENDER_TIME = 1000; export class VisionInterpreter { constructor(agent, allow_vision) { @@ -16,89 +12,61 @@ export class VisionInterpreter { } async lookAtPlayer(player_name, direction) { + if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) { + return "Vision is disabled. Use other methods to describe the environment."; + } + let result = ""; const bot = this.agent.bot; const player = bot.players[player_name]?.entity; if (!player) { - log(bot, `Could not find player ${player_name}`); + return `Could not find player ${player_name}`; } let filename; if (direction === 'with') { await bot.look(player.yaw, player.pitch); const camera = new Camera(bot, this.fp); - await new Promise(resolve => setTimeout(resolve, 500)); - log(bot, `Looking in the same direction as ${player_name}`); + await new Promise(resolve => setTimeout(resolve, RENDER_TIME)); + result = `Looking in the same direction as ${player_name}\n`; filename = await camera.capture(); } else { await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z)); const camera = new Camera(bot, this.fp); - await new Promise(resolve => setTimeout(resolve, 500)); - log(bot, `Looking at player ${player_name}`); + await new Promise(resolve => setTimeout(resolve, RENDER_TIME)); + result = `Looking at player ${player_name}\n`; filename = await camera.capture(); + } - if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) { - log(this.agent.bot, "Vision is disabled. Using text-based environment description instead."); - log(this.agent.bot, this._nearbyBlocks()); - } else { - await this.analyzeImage(filename); - } + return result + `Image analysis: "${await this.analyzeImage(filename)}"`; } async lookAtPosition(x, y, z) { + if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) { + return "Vision is disabled. 
Use other methods to describe the environment."; + } + let result = ""; const bot = this.agent.bot; await bot.lookAt(new Vec3(x, y + 2, z)); const camera = new Camera(bot, this.fp); - await new Promise(resolve => setTimeout(resolve, 500)); - log(bot, `Looking at coordinate ${x, y, z}`); + await new Promise(resolve => setTimeout(resolve, RENDER_TIME)); + result = `Looking at coordinate ${x, y, z}\n`; let filename = await camera.capture(); - if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) { - log(this.agent.bot, "Vision is disabled. Using text-based environment description instead."); - log(this.agent.bot, this._nearbyBlocks()); - } else { - await this.analyzeImage(filename); - } + return result + `Image analysis: "${await this.analyzeImage(filename)}"`; } async analyzeImage(filename) { - let prompt = this.agent.prompter.profile.image_conversing; - let res = null; - try { - const bot = this.agent.bot; const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`); const messages = this.agent.history.getHistory(); - res = await this.agent.prompter.vision_model.sendVisionRequest(messages, prompt, imageBuffer); - - if (res == 'Vision is only supported by certain models.') { - log(bot, "Vision may not be supported on this model. Using text-based environment description instead."); - log(bot, this._nearbyBlocks()); - } else { - log(bot, res); - } + + return await this.agent.prompter.promptVision(messages, imageBuffer); } catch (error) { - log(this.agent.bot, `Error analyzing image: ${error.message}`); + console.warn('Error reading image:', error); + return `Error reading image: ${error.message}`; } } - - _nearbyBlocks() { - const bot = this.agent.bot; - let res = 'NEARBY_BLOCKS'; - - let blocks = world.getNearbyBlockTypes(bot); - for (let i = 0; i < blocks.length; i++) { - res += `\n- ${blocks[i]}`; - } - if (blocks.length == 0) { - res += ': none'; - } else { - // Environmental Awareness - res += '\n- ' + world.getSurroundingBlocks(bot).join('\n- ') - res += `\n- First Solid Block Above Head: ${world.getFirstBlockAboveHead(bot, null, 32)}`; - } - return pad(res); - } } \ No newline at end of file diff --git a/src/models/prompter.js b/src/models/prompter.js index 5ac6a1f..786b623 100644 --- a/src/models/prompter.js +++ b/src/models/prompter.js @@ -343,6 +343,13 @@ git } return res.trim().toLowerCase() === 'respond'; } + async promptVision(messages, imageBuffer) { + await this.checkCooldown(); + let prompt = this.profile.image_analysis; + prompt = await this.replaceStrings(prompt, messages, null, null, null); + return await this.vision_model.sendVisionRequest(messages, prompt, imageBuffer); + } + async promptGoalSetting(messages, last_goals) { let system_message = this.profile.goal_setting; system_message = await this.replaceStrings(system_message, messages); From 222f988873b35bfcaa9d0ae93ded2a2f88e1c7e1 Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Wed, 5 Mar 2025 15:32:15 -0600 Subject: [PATCH 19/27] fixed merge mistake --- src/agent/agent.js | 75 ---------------------------------------------- 1 file changed, 75 deletions(-) diff --git a/src/agent/agent.js b/src/agent/agent.js index 03b8ec7..f658974 100644 --- a/src/agent/agent.js +++ b/src/agent/agent.js @@ -76,81 +76,6 @@ export class Agent { this.bot.chat(`/skin clear`); }); - const spawnTimeout = setTimeout(() => { - process.exit(0); - }, 30000); - this.bot.once('spawn', async () => { - try { - clearTimeout(spawnTimeout); - addViewer(this.bot, count_id); - - // wait for a bit so stats are not 
undefined - await new Promise((resolve) => setTimeout(resolve, 1000)); - - console.log(`${this.name} spawned.`); - this.clearBotLogs(); - - this._setupEventHandlers(save_data, init_message); - this.startEvents(); - - this.task.initBotTask(); - - } catch (error) { - console.error('Error in spawn event:', error); - process.exit(0); - } - }); - - console.log('Starting agent initialization with profile:', profile_fp); - - // Initialize components with more detailed error handling - console.log('Initializing action manager...'); - this.actions = new ActionManager(this); - console.log('Initializing prompter...'); - this.prompter = new Prompter(this, profile_fp); - this.name = this.prompter.getName(); - console.log('Initializing history...'); - this.history = new History(this); - console.log('Initializing coder...'); - this.coder = new Coder(this); - console.log('Initializing npc controller...'); - this.npc = new NPCContoller(this); - console.log('Initializing memory bank...'); - this.memory_bank = new MemoryBank(); - console.log('Initializing self prompter...'); - this.self_prompter = new SelfPrompter(this); - convoManager.initAgent(this); - console.log('Initializing examples...'); - await this.prompter.initExamples(); - console.log('Initializing task...'); - this.task = new Task(this, task_path, task_id); - const blocked_actions = this.task.blocked_actions || []; - blacklistCommands(blocked_actions); - - serverProxy.connect(this); - - console.log(this.name, 'logging into minecraft...'); - this.bot = initBot(this.name); - - initModes(this); - - let save_data = null; - if (load_mem) { - save_data = this.history.load(); - } - - this.bot.on('login', () => { - console.log(this.name, 'logged in!'); - - serverProxy.login(); - - // Set skin for profile, requires Fabric Tailor. 
(https://modrinth.com/mod/fabrictailor) - if (this.prompter.profile.skin) - this.bot.chat(`/skin set URL ${this.prompter.profile.skin.model} ${this.prompter.profile.skin.path}`); - else - this.bot.chat(`/skin clear`); - }); - const spawnTimeout = setTimeout(() => { process.exit(0); }, 30000); From 5dca9b778f6319bb9324077ec641d585683f278e Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Wed, 5 Mar 2025 15:35:50 -0600 Subject: [PATCH 20/27] readd canvas, remove random "git" --- package.json | 1 + src/models/prompter.js | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index 0a56991..ac70774 100644 --- a/package.json +++ b/package.json @@ -5,6 +5,7 @@ "@google/generative-ai": "^0.2.1", "@huggingface/inference": "^2.8.1", "@mistralai/mistralai": "^1.1.0", + "canvas": "^3.1.0", "express": "^4.18.2", "google-translate-api-x": "^10.7.1", "groq-sdk": "^0.15.0", diff --git a/src/models/prompter.js b/src/models/prompter.js index 6ca26ea..b412ee5 100644 --- a/src/models/prompter.js +++ b/src/models/prompter.js @@ -339,7 +339,7 @@ export class Prompter { let resp = await this.code_model.sendRequest(messages, prompt); this.awaiting_coding = false; return resp; -git } + } async promptMemSaving(to_summarize) { await this.checkCooldown(); From dcdb7d2de1c1facba7040ebb08d508cd78afccd1 Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Wed, 5 Mar 2025 15:46:45 -0600 Subject: [PATCH 21/27] refactor/cleanup --- src/agent/agent.js | 2 +- src/agent/library/skills.js | 2 -- src/{utils => agent/vision}/camera.js | 0 src/agent/{ => vision}/vision_interpreter.js | 2 +- src/models/claude.js | 2 +- 5 files changed, 3 insertions(+), 5 deletions(-) rename src/{utils => agent/vision}/camera.js (100%) rename src/agent/{ => vision}/vision_interpreter.js (98%) diff --git a/src/agent/agent.js b/src/agent/agent.js index f658974..f186e52 100644 --- a/src/agent/agent.js +++ b/src/agent/agent.js @@ -1,6 +1,6 @@ import { History } from './history.js'; import { Coder } from './coder.js'; -import { VisionInterpreter } from './vision_interpreter.js'; +import { VisionInterpreter } from './vision/vision_interpreter.js'; import { Prompter } from '../models/prompter.js'; import { initModes } from './modes.js'; import { initBot } from '../utils/mcdata.js'; diff --git a/src/agent/library/skills.js b/src/agent/library/skills.js index 7e254dd..7700683 100644 --- a/src/agent/library/skills.js +++ b/src/agent/library/skills.js @@ -1,9 +1,7 @@ import * as mc from "../../utils/mcdata.js"; -import { Camera } from "../../utils/camera.js"; import * as world from "./world.js"; import pf from 'mineflayer-pathfinder'; import Vec3 from 'vec3'; -import fs from 'fs'; export function log(bot, message) { diff --git a/src/utils/camera.js b/src/agent/vision/camera.js similarity index 100% rename from src/utils/camera.js rename to src/agent/vision/camera.js diff --git a/src/agent/vision_interpreter.js b/src/agent/vision/vision_interpreter.js similarity index 98% rename from src/agent/vision_interpreter.js rename to src/agent/vision/vision_interpreter.js index 28c326e..34d9f3c 100644 --- a/src/agent/vision_interpreter.js +++ b/src/agent/vision/vision_interpreter.js @@ -1,5 +1,5 @@ import { Vec3 } from 'vec3'; -import { Camera } from "../utils/camera.js"; +import { Camera } from "./camera.js"; import fs from 'fs'; const RENDER_TIME = 1000; diff --git a/src/models/claude.js b/src/models/claude.js index 563dc88..d6e48bc 100644 --- a/src/models/claude.js +++ b/src/models/claude.js @@ -26,7 +26,7 @@ 
export class Claude { this.params.max_tokens = this.params.thinking.budget_tokens + 1000; // max_tokens must be greater than thinking.budget_tokens } else { - this.params.max_tokens = 16000; + this.params.max_tokens = 4096; } } const resp = await this.anthropic.messages.create({ From bdee71ac921b80a967f27daca643e26c4512ddc6 Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Fri, 7 Mar 2025 14:19:55 -0600 Subject: [PATCH 22/27] camera always on, show entities, refactor browser viewer --- profiles/claude_thinker.json | 2 +- settings.js | 2 +- src/agent/agent.js | 9 ++-- .../{viewer.js => vision/browser_viewer.js} | 4 +- src/agent/vision/camera.js | 50 +++++++++---------- src/agent/vision/vision_interpreter.js | 15 ++---- 6 files changed, 38 insertions(+), 44 deletions(-) rename src/agent/{viewer.js => vision/browser_viewer.js} (69%) diff --git a/profiles/claude_thinker.json b/profiles/claude_thinker.json index aab9f10..fdddb59 100644 --- a/profiles/claude_thinker.json +++ b/profiles/claude_thinker.json @@ -6,7 +6,7 @@ "params": { "thinking": { "type": "enabled", - "budget_tokens": 16000 + "budget_tokens": 4000 } } }, diff --git a/settings.js b/settings.js index 1ae05a3..d3899b4 100644 --- a/settings.js +++ b/settings.js @@ -1,6 +1,6 @@ export default { - "minecraft_version": "1.20.4", // supports up to 1.21.1 + "minecraft_version": "1.21.1", // supports up to 1.21.1 "host": "127.0.0.1", // or "localhost", "your.ip.address.here" "port": process.env.MINECRAFT_PORT || 55916, "auth": "offline", // or "microsoft" diff --git a/src/agent/agent.js b/src/agent/agent.js index f186e52..fb123f6 100644 --- a/src/agent/agent.js +++ b/src/agent/agent.js @@ -11,7 +11,7 @@ import { MemoryBank } from './memory_bank.js'; import { SelfPrompter } from './self_prompter.js'; import convoManager from './conversation.js'; import { handleTranslation, handleEnglishTranslation } from '../utils/translator.js'; -import { addViewer } from './viewer.js'; +import { addBrowserViewer } from './vision/browser_viewer.js'; import settings from '../../settings.js'; import { serverProxy } from './agent_proxy.js'; import { Task } from './tasks.js'; @@ -36,8 +36,6 @@ export class Agent { this.history = new History(this); console.log('Initializing coder...'); this.coder = new Coder(this); - console.log('Initializing vision intepreter...'); - this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision); console.log('Initializing npc controller...'); this.npc = new NPCContoller(this); console.log('Initializing memory bank...'); @@ -82,7 +80,7 @@ export class Agent { this.bot.once('spawn', async () => { try { clearTimeout(spawnTimeout); - addViewer(this.bot, count_id); + addBrowserViewer(this.bot, count_id); // wait for a bit so stats are not undefined await new Promise((resolve) => setTimeout(resolve, 1000)); @@ -97,6 +95,9 @@ export class Agent { this.task.initBotTask(); } + console.log('Initializing vision intepreter...'); + this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision); + } catch (error) { console.error('Error in spawn event:', error); process.exit(0); diff --git a/src/agent/viewer.js b/src/agent/vision/browser_viewer.js similarity index 69% rename from src/agent/viewer.js rename to src/agent/vision/browser_viewer.js index 6ce8a27..9ae7c7b 100644 --- a/src/agent/viewer.js +++ b/src/agent/vision/browser_viewer.js @@ -1,8 +1,8 @@ -import settings from '../../settings.js'; +import settings from '../../../settings.js'; import prismarineViewer from 'prismarine-viewer'; const 
mineflayerViewer = prismarineViewer.mineflayer; -export function addViewer(bot, count_id) { +export function addBrowserViewer(bot, count_id) { if (settings.show_bot_views) mineflayerViewer(bot, { port: 3000+count_id, firstPerson: true, }); } \ No newline at end of file diff --git a/src/agent/vision/camera.js b/src/agent/vision/camera.js index 7eafb42..c6d9487 100644 --- a/src/agent/vision/camera.js +++ b/src/agent/vision/camera.js @@ -14,37 +14,37 @@ global.Worker = worker_threads.Worker; export class Camera extends EventEmitter { constructor (bot, fp) { - super() - this.bot = bot - this.fp = fp - this.viewDistance = 4 - this.width = 800 - this.height = 512 - this.canvas = createCanvas(this.width, this.height) - this.renderer = new THREE.WebGLRenderer({ canvas: this.canvas }) - this.viewer = new Viewer(this.renderer) - this._init().then(() => { - this.emit('ready') - }) + super(); + this.bot = bot; + this.fp = fp; + this.viewDistance = 4; + this.width = 800; + this.height = 512; + this.canvas = createCanvas(this.width, this.height); + this.renderer = new THREE.WebGLRenderer({ canvas: this.canvas }); + this.viewer = new Viewer(this.renderer); + this._init().then(() => { + this.emit('ready'); + }) } async _init () { - const botPos = this.bot.entity.position - const center = new Vec3(botPos.x, botPos.y+this.bot.entity.height, botPos.z) - this.viewer.setVersion(this.bot.version) - // Load world - const worldView = new WorldView(this.bot.world, this.viewDistance, center) - this.viewer.listen(worldView) - - this.viewer.camera.position.set(center.x, center.y, center.z) - this.viewer.setFirstPersonCamera(this.bot.entity.position, this.bot.entity.yaw, this.bot.entity.pitch) - - await worldView.init(center) + const botPos = this.bot.entity.position; + const center = new Vec3(botPos.x, botPos.y+this.bot.entity.height, botPos.z); + this.viewer.setVersion(this.bot.version); + // Load world + const worldView = new WorldView(this.bot.world, this.viewDistance, center); + this.viewer.listen(worldView); + worldView.listenToBot(this.bot); + await worldView.init(center); + this.worldView = worldView; } async capture() { - // waits some time helps renderer to render the world view - await new Promise(resolve => setTimeout(resolve, 1000)); + const center = new Vec3(this.bot.entity.position.x, this.bot.entity.position.y+this.bot.entity.height, this.bot.entity.position.z); + this.viewer.camera.position.set(center.x, center.y, center.z); + this.viewer.setFirstPersonCamera(this.bot.entity.position, this.bot.entity.yaw, this.bot.entity.pitch); + this.viewer.update(); this.renderer.render(this.viewer.scene, this.viewer.camera); const imageStream = this.canvas.createJPEGStream({ diff --git a/src/agent/vision/vision_interpreter.js b/src/agent/vision/vision_interpreter.js index 34d9f3c..2c03276 100644 --- a/src/agent/vision/vision_interpreter.js +++ b/src/agent/vision/vision_interpreter.js @@ -2,13 +2,12 @@ import { Vec3 } from 'vec3'; import { Camera } from "./camera.js"; import fs from 'fs'; -const RENDER_TIME = 1000; - export class VisionInterpreter { constructor(agent, allow_vision) { this.agent = agent; this.allow_vision = allow_vision; this.fp = './bots/'+agent.name+'/screenshots/'; + this.camera = new Camera(agent.bot, this.fp); } async lookAtPlayer(player_name, direction) { @@ -25,16 +24,12 @@ export class VisionInterpreter { let filename; if (direction === 'with') { await bot.look(player.yaw, player.pitch); - const camera = new Camera(bot, this.fp); - await new Promise(resolve => setTimeout(resolve, 
RENDER_TIME)); result = `Looking in the same direction as ${player_name}\n`; - filename = await camera.capture(); + filename = await this.camera.capture(); } else { await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z)); - const camera = new Camera(bot, this.fp); - await new Promise(resolve => setTimeout(resolve, RENDER_TIME)); result = `Looking at player ${player_name}\n`; - filename = await camera.capture(); + filename = await this.camera.capture(); } @@ -48,11 +43,9 @@ export class VisionInterpreter { let result = ""; const bot = this.agent.bot; await bot.lookAt(new Vec3(x, y + 2, z)); - const camera = new Camera(bot, this.fp); - await new Promise(resolve => setTimeout(resolve, RENDER_TIME)); result = `Looking at coordinate ${x, y, z}\n`; - let filename = await camera.capture(); + let filename = await this.camera.capture(); return result + `Image analysis: "${await this.analyzeImage(filename)}"`; } From c13bf4895da386a0ff106ade87eac8af7c43adfc Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Fri, 7 Mar 2025 23:18:49 -0600 Subject: [PATCH 23/27] improve prompt/examples --- profiles/defaults/_default.json | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/profiles/defaults/_default.json b/profiles/defaults/_default.json index b06b249..ab63863 100644 --- a/profiles/defaults/_default.json +++ b/profiles/defaults/_default.json @@ -9,7 +9,7 @@ "bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:", - "image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, structures, and notable features. Focus on details relevant to the conversation. Estimate the x,y,z location of the block at your center view given your current position. Be extremely concise and correct, respond only with your analysis, not conversationally. $STATS", + "image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, entities, structures, and notable features. Focus on details relevant to the conversation. Note the view is glitchy; the sky color is always blue regardless of weather or time, dropped items are pink cubes, blocks below y=0 do not render. Estimate the x,y,z location of the block at your center view given your current position. Be extremely concise and correct, respond only with your analysis, not conversationally. 
$STATS", "modes": { "self_preservation": true, @@ -71,19 +71,13 @@ [ {"role": "user", "content": "d7297: look here"}, {"role": "assistant", "content": "Okay! !lookAtPlayer(\"d7297\", \"at\")"}, - {"role": "system", "content": "Code Output:\nLooking at player d7297. There is a giant waterfall behind d7297."}, + {"role": "system", "content": "Image analysis:\nLooking at player d7297. There is a giant waterfall behind d7297."}, {"role": "assistant", "content": "Woo, that is a cool waterfall!"} ], - [ - {"role": "user", "content": "d7297: look there"}, - {"role": "assistant", "content": "Okay! !lookAtPlayer(\"d7297\", \"with\")"}, - {"role": "system", "content": "Code Output:\nLooking in the same direction as d7297\nThere is a castle made of stone."}, - {"role": "assistant", "content": "Wow, that stone castle looks amazing!"} - ], [ {"role": "user", "content": "d7297: look 0 70 12 and describe what is there"}, {"role": "assistant", "content": "Sure! !lookAtPosition(0, 70, 12)"}, - {"role": "system", "content": "Code Output:\nLooking in the same direction as d7297\nThere is a water fountain."}, + {"role": "system", "content": "Image analysis:\nLooking at position 0, 70, 12. There is a water fountain."}, {"role": "assistant", "content": "I see the water fountain! amazing!"} ], From 39abf8a19e899c49a38188d14ec0ad434ca3ca10 Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Fri, 7 Mar 2025 23:19:12 -0600 Subject: [PATCH 24/27] update worldview pos, extend view range --- src/agent/vision/camera.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/agent/vision/camera.js b/src/agent/vision/camera.js index c6d9487..e8a07fb 100644 --- a/src/agent/vision/camera.js +++ b/src/agent/vision/camera.js @@ -17,7 +17,7 @@ export class Camera extends EventEmitter { super(); this.bot = bot; this.fp = fp; - this.viewDistance = 4; + this.viewDistance = 12; this.width = 800; this.height = 512; this.canvas = createCanvas(this.width, this.height); @@ -43,6 +43,7 @@ export class Camera extends EventEmitter { async capture() { const center = new Vec3(this.bot.entity.position.x, this.bot.entity.position.y+this.bot.entity.height, this.bot.entity.position.z); this.viewer.camera.position.set(center.x, center.y, center.z); + await this.worldView.updatePosition(center); this.viewer.setFirstPersonCamera(this.bot.entity.position, this.bot.entity.yaw, this.bot.entity.pitch); this.viewer.update(); this.renderer.render(this.viewer.scene, this.viewer.camera); From 4cd5a8f658784008f72da1107d8e57242d34a47e Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Fri, 7 Mar 2025 23:19:41 -0600 Subject: [PATCH 25/27] patch viewer to so not constantly throwing errors --- patches/prismarine-viewer+1.33.0.patch | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 patches/prismarine-viewer+1.33.0.patch diff --git a/patches/prismarine-viewer+1.33.0.patch b/patches/prismarine-viewer+1.33.0.patch new file mode 100644 index 0000000..3ef7a5a --- /dev/null +++ b/patches/prismarine-viewer+1.33.0.patch @@ -0,0 +1,13 @@ +diff --git a/node_modules/prismarine-viewer/viewer/lib/entity/Entity.js b/node_modules/prismarine-viewer/viewer/lib/entity/Entity.js +index 8945452..dab25be 100644 +--- a/node_modules/prismarine-viewer/viewer/lib/entity/Entity.js ++++ b/node_modules/prismarine-viewer/viewer/lib/entity/Entity.js +@@ -203,7 +203,7 @@ function getMesh (texture, jsonModel) { + class Entity { + constructor (version, type, scene) { + const e = entities[type] +- if (!e) throw new Error(`Unknown entity 
${type}`) ++ if (!e) return; //throw new Error(`Unknown entity ${type}`) + + this.mesh = new THREE.Object3D() + for (const [name, jsonModel] of Object.entries(e.geometry)) { From c5b860d6249dd43c8e21e61e2e0a641d924524fa Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Sat, 15 Mar 2025 17:25:11 -0500 Subject: [PATCH 26/27] fix busted up groq --- src/models/groq.js | 61 ++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/src/models/groq.js b/src/models/groq.js index b8dfe6b..e601137 100644 --- a/src/models/groq.js +++ b/src/models/groq.js @@ -27,50 +27,38 @@ export class GroqCloudAPI { } - async sendRequest(turns, systemMessage, stop_seq = null) { - // Variables for DeepSeek-R1 models - const maxAttempts = 5; - let attempt = 0; - let finalRes = null; - let res = null; + async sendRequest(turns, systemMessage, stop_seq = null) { + // Construct messages array + let messages = [{"role": "system", "content": systemMessage}].concat(turns); - // Construct messages array - let messages = [{"role": "system", "content": systemMessage}].concat(turns); + let res = null; - while (attempt < maxAttempts) { - attempt++; + try { + console.log("Awaiting Groq response..."); - // These variables look odd, but they're for the future. - let raw_res = null; - let tool_calls = null; + // Handle deprecated max_tokens parameter + if (this.params.max_tokens) { + console.warn("GROQCLOUD WARNING: A profile is using `max_tokens`. This is deprecated. Please move to `max_completion_tokens`."); + this.params.max_completion_tokens = this.params.max_tokens; + delete this.params.max_tokens; + } - try { - console.log("Awaiting Groq response..."); + if (!this.params.max_completion_tokens) { + this.params.max_completion_tokens = 4000; + } - // Handle deprecated max_tokens parameter - if (this.params.max_tokens) { - console.warn("GROQCLOUD WARNING: A profile is using `max_tokens`. This is deprecated. Please move to `max_completion_tokens`."); - this.params.max_completion_tokens = this.params.max_tokens; - delete this.params.max_tokens; - } + let completion = await this.groq.chat.completions.create({ + "messages": messages, + "model": this.model_name || "llama-3.3-70b-versatile", + "stream": false, + "stop": stop_seq, + ...(this.params || {}) + }); - if (!this.params.max_completion_tokens) { - this.params.max_completion_tokens = 8000; // Set it lower. 
- } - - let completion = await this.groq.chat.completions.create({ - "messages": messages, - "model": this.model_name || "llama-3.3-70b-versatile", - "stream": false, - "stop": stop_seq, - ...(this.params || {}) - }); - - raw_res = completion.choices[0].message; - res = raw_res.content; + let completion = await this.groq.chat.completions.create({ + "messages": messages, + "model": this.model_name || "llama-3.3-70b-versatile", + "stream": false, + "stop": stop_seq, + ...(this.params || {}) + }); + + res = completion.choices[0].message; + res = res.replace(/<think>[\s\S]*?<\/think>/g, '').trim(); } - catch(err) { if (err.message.includes("content must be a string")) { res = "Vision is only supported by certain models."; @@ -80,7 +68,6 @@ export class GroqCloudAPI { } console.log(err); } - return res; } From 2c1ff9e77d5ca34d31649fb32efa83fec7de6215 Mon Sep 17 00:00:00 2001 From: MaxRobinsonTheGreat Date: Sat, 15 Mar 2025 18:10:31 -0500 Subject: [PATCH 27/27] get actual center block view/coords --- profiles/defaults/_default.json | 2 +- src/agent/vision/vision_interpreter.js | 22 +++++++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/profiles/defaults/_default.json b/profiles/defaults/_default.json index ab63863..fc2b60e 100644 --- a/profiles/defaults/_default.json +++ b/profiles/defaults/_default.json @@ -9,7 +9,7 @@ "bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:", - "image_analysis": "You are a Minecraft bot named $NAME that has been given a screenshot of your current view. Analyze and summarize the view; describe terrain, blocks, entities, structures, and notable features. Focus on details relevant to the conversation. Note the view is glitchy; the sky color is always blue regardless of weather or time, dropped items are pink cubes, blocks below y=0 do not render. Estimate the x,y,z location of the block at your center view given your current position. Be extremely concise and correct, respond only with your analysis, not conversationally. 
$STATS", "modes": { "self_preservation": true, diff --git a/src/agent/vision/vision_interpreter.js b/src/agent/vision/vision_interpreter.js index 2c03276..a43acd2 100644 --- a/src/agent/vision/vision_interpreter.js +++ b/src/agent/vision/vision_interpreter.js @@ -7,7 +7,9 @@ export class VisionInterpreter { this.agent = agent; this.allow_vision = allow_vision; this.fp = './bots/'+agent.name+'/screenshots/'; - this.camera = new Camera(agent.bot, this.fp); + if (allow_vision) { + this.camera = new Camera(agent.bot, this.fp); + } } async lookAtPlayer(player_name, direction) { @@ -43,19 +45,33 @@ export class VisionInterpreter { let result = ""; const bot = this.agent.bot; await bot.lookAt(new Vec3(x, y + 2, z)); - result = `Looking at coordinate ${x, y, z}\n`; + result = `Looking at coordinate ${x}, ${y}, ${z}\n`; let filename = await this.camera.capture(); return result + `Image analysis: "${await this.analyzeImage(filename)}"`; } + getCenterBlockInfo() { + const bot = this.agent.bot; + const maxDistance = 128; // Maximum distance to check for blocks + const targetBlock = bot.blockAtCursor(maxDistance); + + if (targetBlock) { + return `Block at center view: ${targetBlock.name} at (${targetBlock.position.x}, ${targetBlock.position.y}, ${targetBlock.position.z})`; + } else { + return "No block in center view"; + } + } + async analyzeImage(filename) { try { const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`); const messages = this.agent.history.getHistory(); - return await this.agent.prompter.promptVision(messages, imageBuffer); + const blockInfo = this.getCenterBlockInfo(); + const result = await this.agent.prompter.promptVision(messages, imageBuffer); + return result + `\n${blockInfo}`; } catch (error) { console.warn('Error reading image:', error);