fix: use text description when vision features are used with a non-vision model

gmuffiness 2025-02-10 02:03:25 +09:00
parent 647655f206
commit 430ae24d20
7 changed files with 69 additions and 19 deletions
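
All seven files follow the same contract: each provider wrapper maps its vendor-specific vision error to one shared sentinel string, and VisionInterpreter checks for that string before falling back to a text description of the surroundings. A minimal sketch of that contract (illustrative only; VISION_UNSUPPORTED and describeOrFallback are hypothetical names, not code from this commit):

    // The exact string every wrapper below returns when its API rejects image input.
    const VISION_UNSUPPORTED = 'Vision is only supported by certain models.';

    // The caller (VisionInterpreter, first hunk below) then falls back to text.
    async function describeOrFallback(visionModel, messages, prompt, imageBuffer, textFallback) {
        const res = await visionModel.sendVisionRequest(messages, prompt, imageBuffer);
        if (res === VISION_UNSUPPORTED) {
            return textFallback(); // e.g. a nearby-blocks summary
        }
        return res;
    }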


@@ -71,7 +71,14 @@ export class VisionInterpreter {
             const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
             const messages = this.agent.history.getHistory();
             res = await this.agent.prompter.vision_model.sendVisionRequest(messages, prompt, imageBuffer);
-            log(bot, res);
+            if (res == 'Vision is only supported by certain models.') {
+                log(bot, "Vision may not be supported on this model. Using text-based environment description instead.");
+                log(bot, this._nearbyBlocks());
+            } else {
+                log(bot, res);
+            }
         } catch (error) {
             log(this.agent.bot, `Error analyzing image: ${error.message}`);
         }
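
The fallback above calls this._nearbyBlocks(), which is not part of this diff. A rough sketch of what such a helper could look like with mineflayer's bot.findBlocks (an assumption about its shape, not the repository's implementation):

    // Hypothetical helper; the real _nearbyBlocks() is not shown in this commit.
    // Summarizes blocks around the bot so a text-only model still gets environment info.
    function nearbyBlocksSummary(bot) {
        const positions = bot.findBlocks({
            matching: (block) => block && block.name !== 'air',
            maxDistance: 16,
            count: 64,
        });
        const names = new Set(positions.map((p) => bot.blockAt(p).name));
        return `NEARBY_BLOCKS: ${[...names].join(', ')}`;
    }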


@@ -35,8 +35,12 @@ export class Claude {
             res = resp.content[0].text;
         }
         catch (err) {
+            if (err.message.includes("does not support image input")) {
+                res = "Vision is only supported by certain models.";
+            } else {
+                res = "My brain disconnected, try again.";
+            }
             console.log(err);
-            res = 'My brain disconnected, try again.';
         }
         return res;
     }
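
This hunk only touches Claude's error handling; how the screenshot is attached on the request side is not shown here. Anthropic's Messages API takes base64 image blocks roughly like the following (a sketch of the assumed shape; buildClaudeImageMessage is a hypothetical name, not code from this commit):

    // Assumed shape of an Anthropic-style image message.
    function buildClaudeImageMessage(systemMessage, imageBuffer) {
        return {
            role: 'user',
            content: [
                { type: 'text', text: systemMessage },
                {
                    type: 'image',
                    source: {
                        type: 'base64',
                        media_type: 'image/jpeg',
                        data: imageBuffer.toString('base64'),
                    },
                },
            ],
        };
    }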


@@ -102,15 +102,25 @@ export class Gemini {
         const stop_seq = '***';
         const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
-        console.log('Awaiting Google API vision response...');
-        const result = await model.generateContent([prompt, imagePart]);
-        const response = await result.response;
-        const text = response.text();
-        console.log('Received.');
-        if (!text.includes(stop_seq)) return text;
-        const idx = text.indexOf(stop_seq);
-        return text.slice(0, idx);
+        let res = null;
+        try {
+            console.log('Awaiting Google API vision response...');
+            const result = await model.generateContent([prompt, imagePart]);
+            const response = await result.response;
+            const text = response.text();
+            console.log('Received.');
+            if (!text.includes(stop_seq)) return text;
+            const idx = text.indexOf(stop_seq);
+            res = text.slice(0, idx);
+        } catch (err) {
+            console.log(err);
+            if (err.message.includes("Image input modality is not enabled for models/")) {
+                res = "Vision is only supported by certain models.";
+            } else {
+                res = "An unexpected error occurred, please try again.";
+            }
+        }
+        return res;
     }
     async embed(text) {
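
The imagePart passed to generateContent is built outside this hunk. With the @google/generative-ai SDK, an inline JPEG part is typically constructed as below (assumed, since the construction is not part of the diff; buildGeminiImagePart is a hypothetical name):

    // Assumed construction of the inline image part for the Google Generative AI SDK.
    function buildGeminiImagePart(imageBuffer) {
        return {
            inlineData: {
                data: imageBuffer.toString('base64'),
                mimeType: 'image/jpeg',
            },
        };
    }
    // model.generateContent([prompt, buildGeminiImagePart(imageBuffer)]) then sends text and image together.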


@@ -48,6 +48,9 @@ export class GPT {
             if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
                 console.log('Context length exceeded, trying again with shorter context.');
                 return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
+            } else if (err.message.includes('image_url')) {
+                console.log(err);
+                res = 'Vision is only supported by certain models.';
             } else {
                 console.log(err);
                 res = 'My brain disconnected, try again.';


@@ -43,6 +43,9 @@ export class Grok {
             if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
                 console.log('Context length exceeded, trying again with shorter context.');
                 return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
+            } else if (err.message.includes('The model expects a single `text` element per message.')) {
+                console.log(err);
+                res = 'Vision is only supported by certain models.';
             } else {
                 console.log(err);
                 res = 'My brain disconnected, try again.';
@@ -51,6 +54,24 @@ export class Grok {
         // sometimes outputs special token <|separator|>, just replace it
         return res.replace(/<\|separator\|>/g, '*no response*');
     }
+    async sendVisionRequest(messages, systemMessage, imageBuffer) {
+        const imageMessages = [...messages];
+        imageMessages.push({
+            role: "user",
+            content: [
+                { type: "text", text: systemMessage },
+                {
+                    type: "image_url",
+                    image_url: {
+                        url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
+                    }
+                }
+            ]
+        });
+        return this.sendRequest(imageMessages, systemMessage);
+    }
     async embed(text) {
         throw new Error('Embeddings are not supported by Grok.');
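
Grok's new sendVisionRequest simply delegates to sendRequest, so the vision-unsupported error handled in the previous hunk surfaces through the same path. A hypothetical call site, mirroring the VisionInterpreter change at the top of this commit (grok, history, prompt, and imageBuffer are stand-in names):

    // Hypothetical usage; the sentinel string must match what the wrappers return exactly.
    const res = await grok.sendVisionRequest(history, prompt, imageBuffer);
    if (res == 'Vision is only supported by certain models.') {
        // fall back to a text-only description of the surroundings, as VisionInterpreter does
    }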


@@ -23,9 +23,6 @@ export class GroqCloudAPI {
         let res = null;
         try {
             console.log("Awaiting Groq response...");
-            if (!this.params.max_tokens) {
-                this.params.max_tokens = 16384;
-            }
             let completion = await this.groq.chat.completions.create({
                 "messages": messages,
                 "model": this.model_name || "mixtral-8x7b-32768",
@@ -43,14 +40,19 @@ export class GroqCloudAPI {
         }
         catch(err) {
+            if (err.message.includes("content must be a string")) {
+                res = "Vision is only supported by certain models.";
+            } else {
+                console.log(this.model_name);
+                res = "My brain disconnected, try again.";
+            }
             console.log(err);
-            res = "My brain just kinda stopped working. Try again.";
         }
         return res;
     }
     async sendVisionRequest(messages, systemMessage, imageBuffer) {
-        const imageMessages = [...messages];
+        const imageMessages = messages.filter(message => message.role !== 'system');
         imageMessages.push({
             role: "user",
             content: [
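
The hunk is cut off at the start of the content array. Judging from the Grok wrapper above, the pushed message presumably keeps the same text plus base64 image_url shape; a sketch under that assumption (buildGroqVisionMessage is a hypothetical name, and these are not the literal lines omitted by the diff):

    // Assumed shape of the full message pushed by GroqCloudAPI's sendVisionRequest,
    // modeled on the Grok wrapper shown earlier.
    function buildGroqVisionMessage(systemMessage, imageBuffer) {
        return {
            role: "user",
            content: [
                { type: "text", text: systemMessage },
                {
                    type: "image_url",
                    image_url: {
                        url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
                    }
                }
            ]
        };
    }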


@@ -56,9 +56,12 @@ export class Mistral {
             result = response.choices[0].message.content;
         } catch (err) {
-            console.log(err)
-            result = "My brain disconnected, try again.";
+            if (err.message.includes("A request containing images has been given to a model which does not have the 'vision' capability.")) {
+                result = "Vision is only supported by certain models.";
+            } else {
+                result = "My brain disconnected, try again.";
+            }
+            console.log(err);
         }
         return result;