Mirror of https://github.com/kolbytn/mindcraft.git
This update finalizes the implementation of three distinct vision modes:

- "off": Disables all my vision capabilities.
- "prompted" (formerly "on"): Allows me to use vision via explicit commands from you (e.g., !lookAtPlayer), after which I summarize the image.
- "always" (formerly "always_active"): I automatically take a screenshot every time you send a prompt and send it with your prompt to a multimodal LLM. If you use a look command in this mode, I only update my view and take a screenshot for the *next* interaction if relevant, without immediate summarization.

Here are the key changes and improvements:

1. **Bug Fix (Image Path ENOENT)**:
   * I've corrected `Camera.capture()` so it returns filenames with the `.jpg` extension.
   * I've updated `VisionInterpreter.analyzeImage()` to handle full filenames.
   * This resolves the `ENOENT` error that was previously occurring in `Prompter.js`.
2. **Vision Mode Renaming**:
   * I've renamed the modes in `settings.js` and throughout the codebase: "on" is now "prompted", and "always_active" is now "always".
3. **Core Framework (from previous work, now integrated)**:
   * I've added `vision_mode` to `settings.js`.
   * `Agent.js` now manages `latestScreenshotPath` and initializes `VisionInterpreter` with `vision_mode`.
   * `VisionInterpreter.js` handles the different behaviors for each mode.
   * My vision commands (`!lookAt...`) respect the `off` mode.
   * `History.js` stores `imagePath` with turns, and `Agent.js` manages this path's lifecycle.
   * `Prompter.js` reads image files when I'm in "always" mode and passes `imageData` to the model wrappers.
4. **Extended Multimodal API Support**:
   * `gemini.js`, `gpt.js`, `claude.js`, `local.js` (Ollama), `qwen.js`, and `deepseek.js` have been updated to accept `imageData` in their `sendRequest` method and format it for their respective multimodal APIs. They now include `supportsRawImageInput = true` (a sketch of this formatting pattern follows this list).
   * Other model wrappers (`mistral.js`, `glhf.js`, `grok.js`, etc.) now safely handle the `imageData` parameter in `sendRequest` (by ignoring it and logging a warning) and set `supportsRawImageInput = false`, ensuring consistent behavior.
5. **Testing**: I have a comprehensive plan to verify all modes and functionality.

This set of changes provides a robust and flexible vision system for me, catering to different operational needs and supporting various multimodal LLMs.
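The multimodal wrappers named in point 4 attach the screenshot to the outgoing prompt. Below is a minimal sketch of what that formatting can look like against an OpenAI-compatible chat-completions API, assuming `imageData` is a `Buffer` of JPEG bytes; the helper name `withImage` and the exact wiring are illustrative assumptions, not code from this commit:

```js
// Sketch: fold a screenshot into the last user turn as an OpenAI-style
// multimodal content array. Assumes imageData is a Buffer of JPEG bytes;
// withImage is a hypothetical helper, not part of the repo.
function withImage(messages, imageData) {
    if (!imageData || messages.length === 0) return messages;
    const last = messages[messages.length - 1];
    last.content = [
        { type: 'text', text: last.content },
        {
            type: 'image_url',
            image_url: { url: `data:image/jpeg;base64,${imageData.toString('base64')}` }
        }
    ];
    return messages;
}
```

Wrappers that cannot embed images, like the `novita.js` file shown below, instead warn and ignore the image so callers get consistent behavior either way.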
75 lines
2.5 KiB
JavaScript
import OpenAIApi from 'openai';
import { getKey } from '../utils/keys.js';
import { strictFormat } from '../utils/text.js';

// llama, mistral
export class Novita {
    constructor(model_name, url, params) {
        this.model_name = model_name.replace('novita/', '');
        this.url = url || 'https://api.novita.ai/v3/openai';
        this.params = params;

        let config = {
            baseURL: this.url
        };
        config.apiKey = getKey('NOVITA_API_KEY');

        this.openai = new OpenAIApi(config);
        // Direct image data in sendRequest is not supported by this wrapper.
        this.supportsRawImageInput = false;
    }
    async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
        if (imageData) {
            console.warn(`[Novita] Warning: imageData provided to sendRequest, but this method in novita.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
        }
        let messages = [{ role: 'system', content: systemMessage }].concat(turns);
        messages = strictFormat(messages);

        const pack = {
            model: this.model_name || 'meta-llama/llama-3.1-70b-instruct',
            messages,
            stop: [stop_seq],
            ...(this.params || {})
        };

        let res = null;
        try {
            console.log('Awaiting novita api response...');
            let completion = await this.openai.chat.completions.create(pack);
            if (completion.choices[0].finish_reason === 'length')
                throw new Error('Context length exceeded');
            console.log('Received.');
            res = completion.choices[0].message.content;
        }
        catch (err) {
            if ((err.message === 'Context length exceeded' || err.code === 'context_length_exceeded') && turns.length > 1) {
                console.log('Context length exceeded, trying again with shorter context.');
                // Retry with the oldest turn dropped, forwarding imageData unchanged.
                return await this.sendRequest(turns.slice(1), systemMessage, imageData, stop_seq);
            } else {
                console.log(err);
                res = 'My brain disconnected, try again.';
            }
        }
        // Strip <think>...</think> reasoning blocks that some models emit.
        if (res.includes('<think>')) {
            const start = res.indexOf('<think>');
            const end = res.indexOf('</think>');
            if (end !== -1) {
                // Remove the whole block, including the 8-character closing tag.
                res = res.substring(0, start) + res.substring(end + 8);
            } else {
                // Unterminated block: drop everything from <think> onward.
                res = res.substring(0, start);
            }
            res = res.trim();
        }
        return res;
    }

    async embed(text) {
        throw new Error('Embeddings are not supported by Novita AI.');
    }
}
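For reference, a hypothetical usage sketch of the wrapper above; the model name and params are illustrative, and `NOVITA_API_KEY` must be available to `getKey()`:

```js
// Hypothetical usage (top-level await works since this is an ES module).
const novita = new Novita('novita/meta-llama/llama-3.1-70b-instruct', null, { temperature: 0.7 });
const reply = await novita.sendRequest(
    [{ role: 'user', content: 'What should I gather first?' }],
    'You are a helpful Minecraft agent.'
);
console.log(reply);
```

Because `supportsRawImageInput` is `false`, passing `imageData` as the third argument would log the warning above and send a text-only request.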