Mirror of https://github.com/kolbytn/mindcraft.git, synced 2025-07-01 06:05:19 +02:00
google-labs-jules[bot]
This update finalizes the implementation of three distinct vision modes:

- "off": This disables all my vision capabilities.
- "prompted": (Formerly "on") This allows me to use vision via explicit commands from you (e.g., !lookAtPlayer), and I will then summarize the image.
- "always": (Formerly "always_active") I will automatically take a screenshot every time you send a prompt and send it with your prompt to a multimodal LLM. If you use a look command in this mode, I will only update my view and take a screenshot for the *next* interaction if relevant, without immediate summarization.

Here are the key changes and improvements:

1. **Bug Fix (Image Path ENOENT)**:
    * I've corrected `Camera.capture()` so it returns filenames with the `.jpg` extension.
    * I've updated `VisionInterpreter.analyzeImage()` to handle full filenames.
    * This resolves the `ENOENT` error that was previously happening in `Prompter.js`.
2. **Vision Mode Renaming**:
    * I've renamed the modes in `settings.js` and throughout the codebase: "on" is now "prompted", and "always_active" is now "always".
3. **Core Framework (from previous work, now integrated)**:
    * I've added `vision_mode` to `settings.js` (a sketch of the hand-off follows below).
    * `Agent.js` now manages `latestScreenshotPath` and initializes `VisionInterpreter` with `vision_mode`.
    * `VisionInterpreter.js` handles different behaviors for each mode.
    * My vision commands (`!lookAt...`) respect the `off` mode.
    * `History.js` stores `imagePath` with turns, and `Agent.js` manages this path's lifecycle.
    * `Prompter.js` reads image files when I'm in "always" mode and passes `imageData` to model wrappers.
4. **Extended Multimodal API Support**:
    * `gemini.js`, `gpt.js`, `claude.js`, `local.js` (Ollama), `qwen.js`, and `deepseek.js` have been updated to accept `imageData` in their `sendRequest` method and format it for their respective multimodal APIs. They now include `supportsRawImageInput = true`.
    * Other model wrappers (`mistral.js`, `glhf.js`, `grok.js`, etc.) now safely handle the `imageData` parameter in `sendRequest` (by ignoring it and logging a warning) and have `supportsRawImageInput = false` for that method, ensuring consistent behavior.
5. **Testing**: I have a comprehensive plan to verify all modes and functionalities.

This set of changes provides a robust and flexible vision system for me, catering to different operational needs and supporting various multimodal LLMs.
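For orientation, here is a minimal sketch of the "always"-mode hand-off described above. The helper name `promptWithVision`, the `settings` object shape, and the use of `readFileSync` are illustrative assumptions; only `vision_mode`, `supportsRawImageInput`, and the `sendRequest(turns, systemMessage, imageData, ...)` signature come from the changes themselves:

```js
// Sketch only, not the actual Prompter.js: shows how a screenshot could be forwarded
// to a model wrapper in "always" mode, and ignored by wrappers without image support.
import { readFileSync } from 'fs';

// settings.js exposes vision_mode as one of "off" | "prompted" | "always"
const settings = { vision_mode: 'always' };

async function promptWithVision(model, turns, systemMessage, latestScreenshotPath) {
    let imageData = null;
    if (settings.vision_mode === 'always' && latestScreenshotPath && model.supportsRawImageInput) {
        // Read the latest screenshot and pass it along as raw image data (a Buffer here).
        imageData = readFileSync(latestScreenshotPath);
    }
    // Wrappers with supportsRawImageInput = false (such as groq.js below)
    // ignore imageData and log a warning instead of failing.
    return model.sendRequest(turns, systemMessage, imageData);
}
```

The groq.js wrapper below implements the "ignore and warn" side of that contract.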
101 lines
3.7 KiB
JavaScript
import Groq from 'groq-sdk'
import { getKey } from '../utils/keys.js';


// THIS API IS NOT TO BE CONFUSED WITH GROK!
// Go to grok.js for that. :)


// Umbrella class for everything under the sun... That GroqCloud provides, that is.
export class GroqCloudAPI {

    constructor(model_name, url, params) {

        this.model_name = model_name;
        this.url = url;
        this.params = params || {};

        // Remove any mention of "tools" from params:
        if (this.params.tools)
            delete this.params.tools;
        // This is just a bit of future-proofing in case we drag Mindcraft in that direction.

        // I'm going to do a sneaky ReplicateAPI theft for a lot of this, aren't I?
        if (this.url)
            console.warn("Groq Cloud has no implementation for custom URLs. Ignoring provided URL.");

        this.groq = new Groq({ apiKey: getKey('GROQCLOUD_API_KEY') });

        // Direct image data in sendRequest is not supported by this wrapper.
        // Groq may offer specific vision models/APIs, but this standard chat method assumes text.
        this.supportsRawImageInput = false;
    }

    async sendRequest(turns, systemMessage, imageData = null, stop_seq = null) {
        if (imageData) {
            console.warn(`[Groq] Warning: imageData provided to sendRequest, but this method in groq.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
        }

        // Construct messages array
        let messages = [{"role": "system", "content": systemMessage}].concat(turns);

        let res = null;

        try {
            console.log("Awaiting Groq response...");

            // Handle deprecated max_tokens parameter
            if (this.params.max_tokens) {
                console.warn("GROQCLOUD WARNING: A profile is using `max_tokens`. This is deprecated. Please move to `max_completion_tokens`.");
                this.params.max_completion_tokens = this.params.max_tokens;
                delete this.params.max_tokens;
            }

            if (!this.params.max_completion_tokens) {
                this.params.max_completion_tokens = 4000;
            }

            let completion = await this.groq.chat.completions.create({
                "messages": messages,
                "model": this.model_name || "llama-3.3-70b-versatile",
                "stream": false,
                "stop": stop_seq,
                ...(this.params || {})
            });

            res = completion.choices[0].message.content;

            res = res.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
        }
        catch(err) {
            if (err.message.includes("content must be a string")) {
                res = "Vision is only supported by certain models.";
            } else {
                console.log(this.model_name);
                res = "My brain disconnected, try again.";
            }
            console.log(err);
        }
        return res;
    }

    async sendVisionRequest(messages, systemMessage, imageBuffer) {
        const imageMessages = messages.filter(message => message.role !== 'system');
        imageMessages.push({
            role: "user",
            content: [
                { type: "text", text: systemMessage },
                {
                    type: "image_url",
                    image_url: {
                        url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
                    }
                }
            ]
        });

        // sendVisionRequest formats its own message array; sendRequest here should not process new imageData.
        return this.sendRequest(imageMessages, systemMessage, null);
    }

    async embed(_) {
        throw new Error('Embeddings are not supported by Groq.');
    }
}