diff --git a/src/agent/vision_interpreter.js b/src/agent/vision_interpreter.js
index 94be3de..9c66cc8 100644
--- a/src/agent/vision_interpreter.js
+++ b/src/agent/vision_interpreter.js
@@ -71,7 +71,14 @@ export class VisionInterpreter {
             const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
             const messages = this.agent.history.getHistory();
             res = await this.agent.prompter.vision_model.sendVisionRequest(messages, prompt, imageBuffer);
-            log(bot, res);
+
+            if (res == 'Vision is only supported by certain models.') {
+                log(bot, "Vision may not be supported on this model. Using text-based environment description instead.");
+                log(bot, this._nearbyBlocks());
+            } else {
+                log(bot, res);
+            }
+
         } catch (error) {
             log(this.agent.bot, `Error analyzing image: ${error.message}`);
         }
diff --git a/src/models/claude.js b/src/models/claude.js
index 236a0bf..74095ef 100644
--- a/src/models/claude.js
+++ b/src/models/claude.js
@@ -35,8 +35,12 @@ export class Claude {
             res = resp.content[0].text;
         }
         catch (err) {
+            if (err.message.includes("does not support image input")) {
+                res = "Vision is only supported by certain models.";
+            } else {
+                res = "My brain disconnected, try again.";
+            }
             console.log(err);
-            res = 'My brain disconnected, try again.';
         }
         return res;
     }
diff --git a/src/models/gemini.js b/src/models/gemini.js
index bc17a57..4c35526 100644
--- a/src/models/gemini.js
+++ b/src/models/gemini.js
@@ -102,15 +102,25 @@ export class Gemini {
 
         const stop_seq = '***';
         const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
-
-        console.log('Awaiting Google API vision response...');
-        const result = await model.generateContent([prompt, imagePart]);
-        const response = await result.response;
-        const text = response.text();
-        console.log('Received.');
-        if (!text.includes(stop_seq)) return text;
-        const idx = text.indexOf(stop_seq);
-        return text.slice(0, idx);
+        let res = null;
+        try {
+            console.log('Awaiting Google API vision response...');
+            const result = await model.generateContent([prompt, imagePart]);
+            const response = await result.response;
+            const text = response.text();
+            console.log('Received.');
+            if (!text.includes(stop_seq)) return text;
+            const idx = text.indexOf(stop_seq);
+            res = text.slice(0, idx);
+        } catch (err) {
+            console.log(err);
+            if (err.message.includes("Image input modality is not enabled for models/")) {
+                res = "Vision is only supported by certain models.";
+            } else {
+                res = "An unexpected error occurred, please try again.";
+            }
+        }
+        return res;
     }
 
     async embed(text) {
diff --git a/src/models/gpt.js b/src/models/gpt.js
index 981f0b2..8540778 100644
--- a/src/models/gpt.js
+++ b/src/models/gpt.js
@@ -48,6 +48,9 @@ export class GPT {
             if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
                 console.log('Context length exceeded, trying again with shorter context.');
                 return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
+            } else if (err.message.includes('image_url')) {
+                console.log(err);
+                res = 'Vision is only supported by certain models.';
             } else {
                 console.log(err);
                 res = 'My brain disconnected, try again.';
diff --git a/src/models/grok.js b/src/models/grok.js
index a8c6672..2878a10 100644
--- a/src/models/grok.js
+++ b/src/models/grok.js
@@ -43,6 +43,9 @@ export class Grok {
             if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
                 console.log('Context length exceeded, trying again with shorter context.');
                 return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
+            } else if (err.message.includes('The model expects a single `text` element per message.')) {
+                console.log(err);
+                res = 'Vision is only supported by certain models.';
             } else {
                 console.log(err);
                 res = 'My brain disconnected, try again.';
@@ -51,6 +54,24 @@ export class Grok {
         // sometimes outputs special token <|separator|>, just replace it
         return res.replace(/<\|separator\|>/g, '*no response*');
     }
+
+    async sendVisionRequest(messages, systemMessage, imageBuffer) {
+        const imageMessages = [...messages];
+        imageMessages.push({
+            role: "user",
+            content: [
+                { type: "text", text: systemMessage },
+                {
+                    type: "image_url",
+                    image_url: {
+                        url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
+                    }
+                }
+            ]
+        });
+
+        return this.sendRequest(imageMessages, systemMessage);
+    }
 
     async embed(text) {
         throw new Error('Embeddings are not supported by Grok.');
diff --git a/src/models/groq.js b/src/models/groq.js
index 0ec99f6..4c8d1e6 100644
--- a/src/models/groq.js
+++ b/src/models/groq.js
@@ -23,9 +23,6 @@ export class GroqCloudAPI {
         let res = null;
         try {
             console.log("Awaiting Groq response...");
-            if (!this.params.max_tokens) {
-                this.params.max_tokens = 16384;
-            }
             let completion = await this.groq.chat.completions.create({
                 "messages": messages,
                 "model": this.model_name || "mixtral-8x7b-32768",
@@ -43,14 +40,19 @@ export class GroqCloudAPI {
 
         }
         catch(err) {
+            if (err.message.includes("content must be a string")) {
+                res = "Vision is only supported by certain models.";
+            } else {
+                console.log(this.model_name);
+                res = "My brain disconnected, try again.";
+            }
             console.log(err);
-            res = "My brain just kinda stopped working. Try again.";
         }
         return res;
     }
 
     async sendVisionRequest(messages, systemMessage, imageBuffer) {
-        const imageMessages = [...messages];
+        const imageMessages = messages.filter(message => message.role !== 'system');
         imageMessages.push({
             role: "user",
             content: [
diff --git a/src/models/mistral.js b/src/models/mistral.js
index f1f3563..72448f1 100644
--- a/src/models/mistral.js
+++ b/src/models/mistral.js
@@ -56,9 +56,12 @@ export class Mistral {
 
             result = response.choices[0].message.content;
         } catch (err) {
-            console.log(err)
-
-            result = "My brain disconnected, try again.";
+            if (err.message.includes("A request containing images has been given to a model which does not have the 'vision' capability.")) {
+                result = "Vision is only supported by certain models.";
+            } else {
+                result = "My brain disconnected, try again.";
+            }
+            console.log(err);
         }
 
         return result;