mindcraft/src/models/groq.js
google-labs-jules[bot] be38f56f12 I've implemented enhanced vision modes with bug fixes and extended API support.
This update finalizes the implementation of three distinct vision modes (a `settings.js` sketch follows the list):
- "off": This disables all my vision capabilities.
- "prompted": (Formerly "on") This allows me to use vision via explicit commands from you (e.g., !lookAtPlayer), and I will then summarize the image.
- "always": (Formerly "always_active") I will automatically take a screenshot every time you send a prompt and send it with your prompt to a multimodal LLM. If you use a look command in this mode, I will only update my view and take a screenshot for the *next* interaction if relevant, without immediate summarization.

Here are the key changes and improvements:

1.  **Bug Fix (Image Path ENOENT)**:
    *   I've corrected `Camera.capture()` so it returns filenames with the `.jpg` extension.
    *   I've updated `VisionInterpreter.analyzeImage()` to handle full filenames.
    *   This resolves the `ENOENT` error previously raised from `Prompter.js` (see the sketch below).
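
    A minimal sketch of the filename fix (illustrative only; `saveScreenshot` and the directory handling are stand-ins, not the real `Camera.capture()` code):

    ```js
    import fs from 'fs';
    import path from 'path';

    // Return the full filename, including the .jpg extension, so downstream
    // code such as VisionInterpreter.analyzeImage() can join it onto the
    // screenshots directory without guessing the extension.
    async function saveScreenshot(dir, imageBuffer) {
        const filename = `vision_${Date.now()}.jpg`; // extension included up front
        await fs.promises.mkdir(dir, { recursive: true });
        await fs.promises.writeFile(path.join(dir, filename), imageBuffer);
        return filename; // a bare name without ".jpg" is what caused the ENOENT
    }
    ```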

2.  **Vision Mode Renaming**:
    *   I've renamed the modes in `settings.js` and throughout the codebase: "on" is now "prompted", and "always_active" is now "always".

3.  **Core Framework (from previous work, now integrated)**:
    *   I've added `vision_mode` to `settings.js`.
    *   `Agent.js` now manages `latestScreenshotPath` and initializes `VisionInterpreter` with `vision_mode`.
    *   `VisionInterpreter.js` handles the different behaviors for each mode (sketched below).
    *   My vision commands (`!lookAt...`) respect the `off` mode.
    *   `History.js` stores `imagePath` with turns, and `Agent.js` manages this path's lifecycle.
    *   `Prompter.js` reads image files when I'm in "always" mode and passes `imageData` to model wrappers.
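
    A rough sketch of how the mode branching could look inside the interpreter (class shape, method, and field names here are assumptions rather than the real `VisionInterpreter.js`):

    ```js
    // Rough sketch only: names and structure are assumptions, not the real class.
    class VisionInterpreterSketch {
        constructor(agent, vision_mode) {
            this.agent = agent;
            this.vision_mode = vision_mode; // "off" | "prompted" | "always"
        }

        async handleLook(camera) {
            if (this.vision_mode === 'off') return 'Vision is disabled.';
            const filename = await camera.capture(); // now returns a full "*.jpg" name
            if (this.vision_mode === 'always') {
                // Remember the screenshot for the next prompt; no summary now.
                this.agent.latestScreenshotPath = filename;
                return 'View updated; the screenshot will accompany the next prompt.';
            }
            // "prompted": summarize the image immediately.
            return this.analyzeImage(filename);
        }

        async analyzeImage(filename) {
            // Stub: the real method reads the file and asks a multimodal model for a summary.
            return `Summary of ${filename} (stub).`;
        }
    }
    ```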

4.  **Extended Multimodal API Support**:
    *   `gemini.js`, `gpt.js`, `claude.js`, `local.js` (Ollama), `qwen.js`, and `deepseek.js` have been updated to accept `imageData` in their `sendRequest` method and format it for their respective multimodal APIs (see the sketch below). They now include `supportsRawImageInput = true`.
    *   Other model wrappers (`mistral.js`, `glhf.js`, `grok.js`, etc.) now handle the `imageData` parameter in `sendRequest` safely (they ignore it and log a warning) and set `supportsRawImageInput = false`, ensuring consistent behavior across providers.
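
    A hedged sketch of the multimodal side of this pattern: the last user turn is rewritten into mixed text-and-image content parts (the field names follow the OpenAI-style chat format also used by `sendVisionRequest` in this file; the real wrappers may differ in detail):

    ```js
    // Rewrites the final user turn into text + image content parts.
    function attachImageToLastTurn(messages, imageData) {
        const last = messages[messages.length - 1];
        const text = typeof last.content === 'string' ? last.content : '';
        last.content = [
            { type: 'text', text },
            {
                type: 'image_url',
                image_url: { url: `data:image/jpeg;base64,${imageData.toString('base64')}` }
            }
        ];
        return messages;
    }
    ```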

5.  **Testing**: I have a comprehensive plan to verify each vision mode and the associated functionality.

This set of changes provides a robust and flexible vision system for me, catering to different operational needs and supporting various multimodal LLMs.
2025-06-07 09:07:02 +00:00


import Groq from 'groq-sdk'
import { getKey } from '../utils/keys.js';
// THIS API IS NOT TO BE CONFUSED WITH GROK!
// Go to grok.js for that. :)
// Umbrella class for everything under the sun... That GroqCloud provides, that is.
export class GroqCloudAPI {

    constructor(model_name, url, params) {
        this.model_name = model_name;
        this.url = url;
        this.params = params || {};

        // Remove any mention of "tools" from params:
        if (this.params.tools)
            delete this.params.tools;
        // This is just a bit of future-proofing in case we drag Mindcraft in that direction.

        // I'm going to do a sneaky ReplicateAPI theft for a lot of this, aren't I?
        if (this.url)
            console.warn("Groq Cloud has no implementation for custom URLs. Ignoring provided URL.");

        this.groq = new Groq({ apiKey: getKey('GROQCLOUD_API_KEY') });

        // Direct image data in sendRequest is not supported by this wrapper.
        // Groq may offer specific vision models/APIs, but this standard chat method assumes text.
        this.supportsRawImageInput = false;
    }
    async sendRequest(turns, systemMessage, imageData = null, stop_seq = null) {
        if (imageData) {
            console.warn(`[Groq] Warning: imageData provided to sendRequest, but this method in groq.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
        }

        // Construct messages array
        let messages = [{"role": "system", "content": systemMessage}].concat(turns);

        let res = null;
        try {
            console.log("Awaiting Groq response...");

            // Handle deprecated max_tokens parameter
            if (this.params.max_tokens) {
                console.warn("GROQCLOUD WARNING: A profile is using `max_tokens`. This is deprecated. Please move to `max_completion_tokens`.");
                this.params.max_completion_tokens = this.params.max_tokens;
                delete this.params.max_tokens;
            }

            if (!this.params.max_completion_tokens) {
                this.params.max_completion_tokens = 4000;
            }

            let completion = await this.groq.chat.completions.create({
                "messages": messages,
                "model": this.model_name || "llama-3.3-70b-versatile",
                "stream": false,
                "stop": stop_seq,
                ...(this.params || {})
            });

            // Extract the text content of the reply and strip any <think>...</think> blocks.
            res = completion.choices[0].message.content;
            res = res.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
        }
        catch(err) {
            if (err.message.includes("content must be a string")) {
                res = "Vision is only supported by certain models.";
            } else {
                console.log(this.model_name);
                res = "My brain disconnected, try again.";
            }
            console.log(err);
        }
        return res;
    }
    async sendVisionRequest(messages, systemMessage, imageBuffer) {
        const imageMessages = messages.filter(message => message.role !== 'system');
        imageMessages.push({
            role: "user",
            content: [
                { type: "text", text: systemMessage },
                {
                    type: "image_url",
                    image_url: {
                        url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
                    }
                }
            ]
        });

        // sendVisionRequest formats its own message array; sendRequest here should not
        // process new imageData, and no stop sequence is used.
        return this.sendRequest(imageMessages, systemMessage, null, null);
    }

    async embed(_) {
        throw new Error('Embeddings are not supported by Groq.');
    }
}
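
// Illustrative usage sketch (the model name echoes the default above, and the
// params are assumptions; in mindcraft the profile/prompter layer normally
// constructs and calls this wrapper for you):
//
//   const model = new GroqCloudAPI('llama-3.3-70b-versatile', null, { temperature: 0.7 });
//   const reply = await model.sendRequest(
//       [{ role: 'user', content: 'Say hello in one word.' }],
//       'You are a helpful assistant.'
//   );
//   console.log(reply);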