diff --git a/package.json b/package.json index f9fd695..d059ef3 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "google-translate-api-x": "^10.7.1", "groq-sdk": "^0.15.0", "install": "^0.13.0", + "lamejs": "^1.2.1", "minecraft-data": "^3.78.0", "mineflayer": "^4.29.0", "mineflayer-armor-manager": "^2.0.1", @@ -21,7 +22,6 @@ "node-canvas-webgl": "PrismarineJS/node-canvas-webgl", "npm": "^11.5.2", "openai": "^4.4.0", - "patch-package": "^8.0.0", "prismarine-item": "^1.15.0", "prismarine-viewer": "^1.32.0", "replicate": "^0.29.4", @@ -40,6 +40,7 @@ "@eslint/js": "^9.13.0", "eslint": "^9.13.0", "eslint-plugin-no-floating-promise": "^2.0.0", - "globals": "^15.11.0" + "globals": "^15.11.0", + "patch-package": "^8.0.0" } } diff --git a/patches/lamejs+1.2.1.patch b/patches/lamejs+1.2.1.patch new file mode 100644 index 0000000..8950598 --- /dev/null +++ b/patches/lamejs+1.2.1.patch @@ -0,0 +1,21 @@ +diff --git a/node_modules/lamejs/lame.all.js b/node_modules/lamejs/lame.all.js +index bfd3637..b905508 100644 +--- a/node_modules/lamejs/lame.all.js ++++ b/node_modules/lamejs/lame.all.js +@@ -1,4 +1,3 @@ +-function lamejs() { + function new_byte(count) { + return new Int8Array(count); + } +@@ -15511,8 +15510,9 @@ WavHeader.readHeader = function (dataView) { + + L3Side.SFBMAX = (Encoder.SBMAX_s * 3); + //testFullLength(); ++export var lamejs = {} + lamejs.Mp3Encoder = Mp3Encoder; + lamejs.WavHeader = WavHeader; +-} ++ + //fs=require('fs'); +-lamejs(); ++//lamejs(); diff --git a/src/agent/speak.js b/src/agent/speak.js index 5b9fb03..4a1c363 100644 --- a/src/agent/speak.js +++ b/src/agent/speak.js @@ -1,6 +1,7 @@ import { exec, spawn } from 'child_process'; import { TTSConfig as pollinationsTTSConfig } from '../models/pollinations.js'; import { TTSConfig as gptTTSConfig } from '../models/gpt.js'; +import { TTSConfig as geminiTTSConfig } from '../models/gemini.js'; let speakingQueue = []; let isSpeaking = false; @@ -44,6 +45,8 @@ 
$s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
         return pollinationsTTSConfig.baseUrl;
       } else if (prov === 'openai') {
         return gptTTSConfig.baseUrl;
+      } else if (prov === 'google') {
+        return geminiTTSConfig.baseUrl;
       } else {
         // fallback
         return 'https://api.openai.com/v1'
@@ -68,6 +71,8 @@ $s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
         audioData = await pollinationsTTSConfig.sendAudioRequest(txt, mdl, voice, url);
       } else if (prov === "openai") {
         audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
+      } else if (prov === "google") {
+        audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
       } else {
         throw new Error(`TTS Provider ${prov} is not supported.`);
       }
diff --git a/src/models/gemini.js b/src/models/gemini.js
index 9177e3a..cda583b 100644
--- a/src/models/gemini.js
+++ b/src/models/gemini.js
@@ -1,7 +1,10 @@
-import { GoogleGenAI } from '@google/genai';
+import { GoogleGenAI, VideoCompressionQuality } from '@google/genai';
 import { toSinglePrompt, strictFormat } from '../utils/text.js';
 import { getKey } from '../utils/keys.js';
+import { lamejs } from 'lamejs/lame.all.js';
+
+
 
 export class Gemini {
     constructor(model, url, params) {
         this.model = model || "gemini-2.5-flash";
@@ -116,3 +119,81 @@ export class Gemini {
         return result.embeddings;
     }
 }
+
+/**
+ * Requests speech for `text` from Gemini and returns it re-encoded as MP3.
+ *
+ * The inline audio payload is decoded from base64, read as 16-bit mono PCM
+ * at 24 kHz, and encoded to a 128 kbps MP3 with lamejs.
+ *
+ * @param {string} text - Text to be spoken.
+ * @param {string} model - Model name forwarded to generateContent.
+ * @param {string} voice - Name of the prebuilt voice to use.
+ * @param {string} [url] - Not used here; accepted because speak.js passes it to every provider.
+ * @returns {Promise<string>} Base64-encoded MP3 data.
+ * @throws {Error} If the response carries no inline audio data.
+ */
+const sendAudioRequest = async (text, model, voice, url) => {
+    const ai = new GoogleGenAI({apiKey: getKey('GEMINI_API_KEY')});
+
+    const response = await ai.models.generateContent({
+        model: model,
+        contents: [{ parts: [{text: text}] }],
+        config: {
+            responseModalities: ['AUDIO'],
+            speechConfig: {
+                voiceConfig: {
+                    prebuiltVoiceConfig: { voiceName: voice },
+                },
+            },
+        },
+    });
+
+    // data is base64-encoded pcm; use the first part that actually carries audio
+    const parts = response.candidates?.[0]?.content?.parts ?? [];
+    const data = parts.find((part) => part?.inlineData?.data)?.inlineData.data;
+    if (!data) {
+        // Buffer.from(undefined, 'base64') would otherwise fail with an opaque TypeError
+        throw new Error('Gemini TTS response did not contain any audio data.');
+    }
+
+    // convert pcm to mp3
+    const SAMPLE_RATE = 24000;
+    const CHANNELS = 1;
+    const KBPS = 128;
+    const pcmBuffer = Buffer.from(data, 'base64');
+    // Read the samples explicitly instead of aliasing pcmBuffer.buffer: an Int16Array
+    // view needs an even byteOffset and uses host byte order. Assumes s16le PCM, as in
+    // Google's speech-generation examples. A trailing odd byte is ignored.
+    const sampleCount = Math.floor(pcmBuffer.length / 2);
+    const pcmInt16Array = new Int16Array(sampleCount);
+    for (let i = 0; i < sampleCount; i++) {
+        pcmInt16Array[i] = pcmBuffer.readInt16LE(i * 2);
+    }
+    const mp3encoder = new lamejs.Mp3Encoder(CHANNELS, SAMPLE_RATE, KBPS);
+    const sampleBlockSize = 1152; // Standard for MPEG audio
+    const mp3Data = [];
+    for (let i = 0; i < pcmInt16Array.length; i += sampleBlockSize) {
+        const sampleChunk = pcmInt16Array.subarray(i, i + sampleBlockSize);
+        const encodedChunk = mp3encoder.encodeBuffer(sampleChunk);
+        if (encodedChunk.length > 0) {
+            mp3Data.push(Buffer.from(encodedChunk));
+        }
+    }
+    // collect whatever the encoder returns when flushed
+    const encodedTail = mp3encoder.flush();
+    if (encodedTail.length > 0) {
+        mp3Data.push(Buffer.from(encodedTail));
+    }
+    const finalBuffer = Buffer.concat(mp3Data);
+    // finished converting
+
+    return finalBuffer.toString('base64');
+};
+
+export const TTSConfig = {
+    sendAudioRequest: sendAudioRequest,
+    // speak.js returns geminiTTSConfig.baseUrl for the 'google' provider;
+    // sendAudioRequest above ignores the url it is handed.
+    baseUrl: 'https://generativelanguage.googleapis.com',
+};
\ No newline at end of file