// mindcraft/src/models/gemini.js

import { GoogleGenerativeAI } from '@google/generative-ai';
import { toSinglePrompt, strictFormat } from '../utils/text.js';
import { getKey } from '../utils/keys.js';

/** Model used for chat and vision requests when no model name was configured. */
const DEFAULT_MODEL = "gemini-1.5-flash";

/** Model used by `Gemini.embed`. */
const EMBEDDING_MODEL = "text-embedding-004";

/**
 * Create a model handle from the SDK client.
 *
 * `getGenerativeModel` is called with exactly two arguments: the model
 * parameters (where `safetySettings` must live to have any effect) and the
 * optional request options (where a custom `baseUrl` lives).
 *
 * @param {object} genAI - Client exposing `getGenerativeModel`.
 * @param {object} modelParams - Model parameters, including `model`.
 * @param {string} [baseUrl] - Optional API base URL override.
 * @returns {object} Whatever `getGenerativeModel` returns.
 */
function createModel(genAI, modelParams, baseUrl) {
    const requestOptions = baseUrl ? { baseUrl } : undefined;
    return genAI.getGenerativeModel(modelParams, requestOptions);
}

/**
 * Thin wrapper around the Google Generative AI SDK offering chat, vision and
 * embedding requests.
 */
export class Gemini {
    /**
     * @param {string} [model_name] - Model to use; falls back to "gemini-1.5-flash" when falsy.
     * @param {string} [url] - Optional base URL passed to the SDK as `baseUrl`.
     * @param {object} [params] - Optional values spread into `generationConfig` by `sendRequest`.
     */
    constructor(model_name, url, params) {
        this.model_name = model_name;
        this.params = params;
        this.url = url;
        // Disable blocking for every harm category. The legacy
        // "HARM_CATEGORY_DANGEROUS" entry is intentionally not listed:
        // NOTE(review): it is not among the SDK's Gemini harm categories and is
        // expected to be rejected once the settings are actually sent - confirm
        // against the installed SDK version.
        this.safetySettings = [
            {
                "category": "HARM_CATEGORY_HARASSMENT",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_HATE_SPEECH",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "threshold": "BLOCK_NONE",
            },
            {
                "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                "threshold": "BLOCK_NONE",
            },
        ];

        this.genAI = new GoogleGenerativeAI(getKey('GEMINI_API_KEY'));
        this.supportsRawImageInput = true;
    }

    /**
     * Send a chat request and return the generated text.
     *
     * The system message is prepended as an extra turn (no `systemInstruction`
     * is set on the model), the turns are normalised with `strictFormat`, and
     * every role other than 'assistant' is sent as 'user'.
     *
     * @param {Array<{role: string, content: string}>} turns - Conversation so far; not modified by this method itself.
     * @param {string} systemMessage - System prompt, sent as the first turn.
     * @param {*} [imageData=null] - Optional image; must support `toString('base64')`
     *     (presumably a Node Buffer holding JPEG data - confirm with callers).
     * @returns {Promise<string>} The response text.
     * @throws Propagates any error raised by the SDK call.
     */
    async sendRequest(turns, systemMessage, imageData = null) {
        const model = createModel(
            this.genAI,
            {
                model: this.model_name || DEFAULT_MODEL,
                safetySettings: this.safetySettings,
            },
            this.url
        );

        console.log('Awaiting Google API response...');

        // Build a new array instead of unshifting into `turns`, so the caller's
        // history does not gain another system entry on every call.
        const formatted = strictFormat([{ role: 'system', content: systemMessage }, ...turns]);
        const contents = formatted.map((turn) => ({
            role: turn.role === 'assistant' ? 'model' : 'user',
            parts: [{ text: turn.content }],
        }));

        if (imageData && contents.length > 0) {
            const lastContent = contents[contents.length - 1];
            if (lastContent.role === 'user') {
                // Images are only attached to a user turn; same part shape as
                // the one used by sendVisionRequest.
                lastContent.parts.push({
                    inlineData: {
                        mimeType: 'image/jpeg',
                        data: imageData.toString('base64'),
                    },
                });
            } else {
                // Not expected when imageData accompanies a user message; the
                // request is still sent, just without the image.
                console.warn('[Gemini] imageData provided, but the last content entry was not from a user. Image not sent.');
            }
        }

        const result = await model.generateContent({
            contents,
            generationConfig: {
                ...(this.params || {}),
            },
        });
        const response = await result.response;
        let text;

        if (this.model_name && this.model_name.includes("thinking")) {
            // For "thinking" models the answer is read from the second part of
            // the first candidate when that part exists.
            const candidates = response.candidates;
            const first = candidates && candidates.length > 0 ? candidates[0] : null;
            const parts = first && first.content ? first.content.parts : null;
            if (parts && parts.length > 1) {
                text = parts[1].text;
            } else {
                console.warn("Unexpected response structure for thinking model:", response);
                text = response.text();
            }
        } else {
            text = response.text();
        }

        console.log('Received.');

        return text;
    }

    /**
     * Send a single-prompt request with one JPEG image attached.
     *
     * The prompt is built with `toSinglePrompt` using '***' as stop sequence;
     * the response is cut at the first occurrence of that sequence. Errors from
     * the API call are logged and turned into a user-facing message instead of
     * being thrown.
     *
     * @param {Array<object>} turns - Conversation turns handed to `toSinglePrompt`.
     * @param {string} systemMessage - System prompt handed to `toSinglePrompt`.
     * @param {*} imageBuffer - Image; must support `toString('base64')`.
     * @returns {Promise<string>} Response text, or an error message string.
     */
    async sendVisionRequest(turns, systemMessage, imageBuffer) {
        const model = createModel(
            this.genAI,
            {
                model: this.model_name || DEFAULT_MODEL,
                safetySettings: this.safetySettings,
            },
            this.url
        );

        const imagePart = {
            inlineData: {
                data: imageBuffer.toString('base64'),
                mimeType: 'image/jpeg',
            },
        };

        const stop_seq = '***';
        const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
        try {
            console.log('Awaiting Google API vision response...');
            const result = await model.generateContent([prompt, imagePart]);
            const response = await result.response;
            const text = response.text();
            console.log('Received.');
            const idx = text.indexOf(stop_seq);
            return idx === -1 ? text : text.slice(0, idx);
        } catch (err) {
            console.log(err);
            // The thrown value is not guaranteed to be an Error with a string
            // message; guard so the handler itself cannot throw.
            const message = err && typeof err.message === 'string' ? err.message : '';
            if (message.includes("Image input modality is not enabled for models/")) {
                return "Vision is only supported by certain models.";
            }
            return "An unexpected error occurred, please try again.";
        }
    }

    /**
     * Compute an embedding for `text` with the "text-embedding-004" model.
     *
     * @param {string} text - Text to embed.
     * @returns {Promise<*>} `embedding.values` from the SDK result.
     * @throws Propagates any error raised by the SDK call.
     */
    async embed(text) {
        const model = createModel(this.genAI, { model: EMBEDDING_MODEL }, this.url);
        const result = await model.embedContent(text);
        return result.embedding.values;
    }
}