Add Gemini TTS

2025-08-30 19:03:03 +02:00 · 2025-08-23 13:18:42 +01:00 · 2025-08-23 13:18:42 +01:00 · 3557215e29
commit 3557215e29
parent b027f1e345
4 changed files with 85 additions and 3 deletions
--- a/package.json
+++ b/package.json
@ -11,6 +11,7 @@
        "google-translate-api-x": "^10.7.1",
        "groq-sdk": "^0.15.0",
        "install": "^0.13.0",
+        "lamejs": "^1.2.1",
        "minecraft-data": "^3.78.0",
        "mineflayer": "^4.29.0",
        "mineflayer-armor-manager": "^2.0.1",
@ -21,7 +22,6 @@
        "node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
        "npm": "^11.5.2",
        "openai": "^4.4.0",
-        "patch-package": "^8.0.0",
        "prismarine-item": "^1.15.0",
        "prismarine-viewer": "^1.32.0",
        "replicate": "^0.29.4",
@ -40,6 +40,7 @@
        "@eslint/js": "^9.13.0",
        "eslint": "^9.13.0",
        "eslint-plugin-no-floating-promise": "^2.0.0",
-        "globals": "^15.11.0"
+        "globals": "^15.11.0",
+        "patch-package": "^8.0.0"
    }
 }
--- a/patches/lamejs+1.2.1.patch
+++ b/patches/lamejs+1.2.1.patch
@ -0,0 +1,21 @@
+diff --git a/node_modules/lamejs/lame.all.js b/node_modules/lamejs/lame.all.js
+index bfd3637..b905508 100644
+--- a/node_modules/lamejs/lame.all.js
+++ b/node_modules/lamejs/lame.all.js
+@@ -1,4 +1,3 @@
+-function lamejs() {
+ function new_byte(count) {
+     return new Int8Array(count);
+ }
+@@ -15511,8 +15510,9 @@ WavHeader.readHeader = function (dataView) {
+ 
+ L3Side.SFBMAX = (Encoder.SBMAX_s * 3);
+ //testFullLength();
+export var lamejs = {}
+ lamejs.Mp3Encoder = Mp3Encoder;
+ lamejs.WavHeader = WavHeader;
+-}
+
+ //fs=require('fs');
+-lamejs();
+//lamejs();
--- a/src/agent/speak.js
+++ b/src/agent/speak.js
@ -1,6 +1,7 @@
 import { exec, spawn } from 'child_process';
 import { TTSConfig as pollinationsTTSConfig } from '../models/pollinations.js';
 import { TTSConfig as gptTTSConfig } from '../models/gpt.js';
+import { TTSConfig as geminiTTSConfig } from '../models/gemini.js';

 let speakingQueue = [];
 let isSpeaking = false;
@ -44,6 +45,8 @@ $s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
        return pollinationsTTSConfig.baseUrl;
      } else if (prov === 'openai') {
        return gptTTSConfig.baseUrl;
+      } else if (prov === 'google') {
+        return geminiTTSConfig.baseUrl;
      } else {
        // fallback
        return 'https://api.openai.com/v1'
@ -68,6 +71,8 @@ $s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
        audioData = await pollinationsTTSConfig.sendAudioRequest(txt, mdl, voice, url);
      } else if (prov === "openai") {
        audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
+      } else if (prov === "google") {
+        audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
      } else {
        throw new Error(`TTS Provider ${prov} is not supported.`);
      }
--- a/src/models/gemini.js
+++ b/src/models/gemini.js
@ -1,7 +1,10 @@
-import { GoogleGenAI } from '@google/genai';
+import { GoogleGenAI, VideoCompressionQuality } from '@google/genai';
 import { toSinglePrompt, strictFormat } from '../utils/text.js';
 import { getKey } from '../utils/keys.js';

+import { lamejs } from 'lamejs/lame.all.js';
+
+
 export class Gemini {
    constructor(model, url, params) {
        this.model = model || "gemini-2.5-flash";
@ -116,3 +119,55 @@ export class Gemini {
        return result.embeddings;
    }
 }
+
+/**
+ * Synthesizes speech for `text` with a Gemini model and returns it as MP3.
+ *
+ * The inline audio returned by the API is base64-encoded PCM; it is read here
+ * as 16-bit mono samples at 24 kHz and re-encoded to MP3 with lamejs.
+ *
+ * @param {string} text - Text to be spoken.
+ * @param {string} model - Model name forwarded to `generateContent`.
+ * @param {string} voice - Prebuilt voice name.
+ * @param {string} [url] - Unused here; accepted so the signature matches the
+ *     four-argument call made from speak.js.
+ * @returns {Promise<string>} Base64-encoded MP3 audio.
+ * @throws {Error} If the response carries no inline audio data.
+ */
+const sendAudioRequest = async (text, model, voice, url) => {
+    const ai = new GoogleGenAI({ apiKey: getKey('GEMINI_API_KEY') });
+
+    const response = await ai.models.generateContent({
+        model,
+        contents: [{ parts: [{ text }] }],
+        config: {
+            responseModalities: ['AUDIO'],
+            speechConfig: {
+                voiceConfig: {
+                    prebuiltVoiceConfig: { voiceName: voice },
+                },
+            },
+        },
+    });
+
+    // data is base64-encoded pcm
+    const data = response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
+    if (data == null) {
+        // Without this guard Buffer.from(undefined, 'base64') throws an opaque
+        // TypeError that says nothing about the missing audio payload.
+        throw new Error('Gemini TTS response did not contain audio data.');
+    }
+
+    // convert pcm to mp3
+    const SAMPLE_RATE = 24000;
+    const CHANNELS = 1;
+    const BITRATE_KBPS = 128;
+    const BYTES_PER_SAMPLE = 2;
+    const pcmBuffer = Buffer.from(data, 'base64');
+    // View the decoded bytes as 16-bit samples; a trailing odd byte is ignored.
+    const pcmInt16Array = new Int16Array(
+        pcmBuffer.buffer,
+        pcmBuffer.byteOffset,
+        Math.floor(pcmBuffer.length / BYTES_PER_SAMPLE)
+    );
+    const mp3encoder = new lamejs.Mp3Encoder(CHANNELS, SAMPLE_RATE, BITRATE_KBPS);
+    const sampleBlockSize = 1152; // Standard for MPEG audio
+    const mp3Data = [];
+    for (let i = 0; i < pcmInt16Array.length; i += sampleBlockSize) {
+        const sampleChunk = pcmInt16Array.subarray(i, i + sampleBlockSize);
+        const encodedChunk = mp3encoder.encodeBuffer(sampleChunk);
+        if (encodedChunk.length > 0) {
+            mp3Data.push(Buffer.from(encodedChunk));
+        }
+    }
+    // flush() returns whatever the encoder is still holding after the last block.
+    const encodedTail = mp3encoder.flush();
+    if (encodedTail.length > 0) {
+        mp3Data.push(Buffer.from(encodedTail));
+    }
+    // finished converting
+
+    return Buffer.concat(mp3Data).toString('base64');
+};
+
+/**
+ * Text-to-speech entry points for the Gemini provider.
+ *
+ * NOTE(review): speak.js reads `baseUrl` from this object for the 'google'
+ * provider, but no such property is defined here — confirm that an undefined
+ * URL is acceptable to the caller.
+ */
+export const TTSConfig = {
+    sendAudioRequest,
+}