# Amended patch: replace say() with speak(), a TTS queue that prefetches
# remote audio while earlier items are still playing.
#
# Review notes on this revision:
# 1. Line breaks and indentation were reconstructed from a copy of the patch
#    that had been collapsed onto a few long lines; re-check context
#    whitespace against the tree before applying.
# 2. The new side of src/agent/speak.js was revised during review, so the
#    blob hash on its "index" line no longer describes the resulting file.
# 3. The src/models/gpt.js hunk is reproduced only as far as the reviewed
#    copy showed it and may be incomplete.
diff --git a/src/agent/agent.js b/src/agent/agent.js
index d69433d..6f8851e 100644
--- a/src/agent/agent.js
+++ b/src/agent/agent.js
@@ -15,7 +15,7 @@ import { addBrowserViewer } from './vision/browser_viewer.js';
 import { serverProxy, sendOutputToServer } from './mindserver_proxy.js';
 import settings from './settings.js';
 import { Task } from './tasks/tasks.js';
-import { say } from './speak.js';
+import { speak } from './speak.js';
 
 export class Agent {
     async start(load_mem=false, init_message=null, count_id=0) {
@@ -384,7 +384,7 @@ export class Agent {
         }
         else {
             if (settings.speak) {
-                say(to_translate, this.prompter.profile.speak_model);
+                speak(to_translate, this.prompter.profile.speak_model);
             }
             if (settings.chat_ingame) {this.bot.chat(message);}
             sendOutputToServer(this.name, message);
diff --git a/src/agent/speak.js b/src/agent/speak.js
index a68735b..003655e 100644
--- a/src/agent/speak.js
+++ b/src/agent/speak.js
@@ -1,107 +1,204 @@
 import { exec, spawn } from 'child_process';
+import { promises as fs } from 'fs';
+import os from 'os';
+import path from 'path';
 import { TTSConfig as gptTTSConfig } from '../models/gpt.js';
 import { TTSConfig as geminiTTSConfig } from '../models/gemini.js';
 
-let speakingQueue = [];
+let speakingQueue = []; // each item: {text, model, audioData, error, ready}
 let isSpeaking = false;
 
-export function say(text, speak_model) {
-  speakingQueue.push([text, speak_model]);
-  if (!isSpeaking) processQueue();
-}
-
-async function processQueue() {
-  if (speakingQueue.length === 0) {
-    isSpeaking = false;
-    return;
-  }
-  isSpeaking = true;
-  const [txt, speak_model] = speakingQueue.shift();
-
-  const isWin = process.platform === 'win32';
-  const isMac = process.platform === 'darwin';
-  const model = speak_model || 'openai/tts-1/echo';
-
-  if (model === 'system') {
-    // system TTS
-    const cmd = isWin
-      ? `powershell -NoProfile -Command "Add-Type -AssemblyName System.Speech; \
-$s=New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate=2; \
-$s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
-      : isMac
-      ? `say "${txt.replace(/"/g,'\\"')}"`
-      : `espeak "${txt.replace(/"/g,'\\"')}"`;
-
-    exec(cmd, err => {
-      if (err) console.error('TTS error', err);
-      processQueue();
-    });
-
-  } else {
-
-    function getModelUrl(prov) {
-      if (prov === 'openai') {
-        return gptTTSConfig.baseUrl;
-      } else if (prov === 'google') {
-        return geminiTTSConfig.baseUrl;
-      } else {
-        // fallback
-        return 'https://api.openai.com/v1'
-      }
-    }
-
-    // remote audio provider
-    let prov, mdl, voice, url;
-    if (typeof model === "string") {
-      [prov, mdl, voice] = model.split('/');
-      url = getModelUrl(prov);
-    } else {
-      prov = model.api;
-      mdl = model.model;
-      voice = model.voice;
-      url = model.url || getModelUrl(prov);
-    }
-
-    try {
-      let audioData;
-      if (prov === "openai") {
-        audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
-      } else if (prov === "google") {
-        audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
-      } else {
-        throw new Error(`TTS Provider ${prov} is not supported.`);
-      }
-
-      if (!audioData) {
-        throw new Error("TTS model did not return audio data");
-        // will be handled below
-      }
-
-      if (isWin) {
-        const ps = `
-          Add-Type -AssemblyName presentationCore;
-          $p=New-Object System.Windows.Media.MediaPlayer;
-          $p.Open([Uri]::new("data:audio/mp3;base64,${audioData}"));
-          $p.Play();
-          Start-Sleep -Seconds [math]::Ceiling($p.NaturalDuration.TimeSpan.TotalSeconds);
-        `;
-        spawn('powershell', ['-NoProfile','-Command', ps], {
-          stdio: 'ignore', detached: true
-        }).unref();
-        processQueue();
-
-      } else {
-        const player = spawn('ffplay', ['-nodisp','-autoexit','pipe:0'], {
-          stdio: ['pipe','ignore','ignore']
-        });
-        player.stdin.write(Buffer.from(audioData, 'base64'));
-        player.stdin.end();
-        player.on('exit', processQueue);
-      }
-
-    } catch (e) {
-      console.error('[TTS] Audio error', e);
-      processQueue();
-    }
-  }
-}
+/**
+ * Windows system-TTS command line. The text to speak is read from the
+ * TTS_TEXT environment variable, so it is never spliced into the command.
+ */
+const WINDOWS_TTS_COMMAND = [
+    'powershell -NoProfile -Command "Add-Type -AssemblyName System.Speech;',
+    '$s=New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate=2;',
+    '$s.Speak($env:TTS_TEXT); $s.Dispose()"',
+].join(' ');
+
+/**
+ * Queue `text` to be spoken aloud.
+ *
+ * For remote providers the audio request is started immediately, so it can
+ * run while earlier queue items are still playing; playback stays FIFO.
+ *
+ * @param {string} text - Text to speak; null, undefined, empty and
+ *   whitespace-only text is ignored.
+ * @param {string|object} [speak_model] - 'system' (the default) for the OS
+ *   voice, a 'provider/model/voice' string, or an object with `api`, `model`,
+ *   `voice` and optional `url` fields.
+ */
+export function speak(text, speak_model) {
+    const txt = text == null ? '' : String(text);
+    // Drop empty utterances before a remote TTS request is issued for them.
+    if (txt.trim() === '') return;
+
+    const model = speak_model || 'system';
+    const item = { text: txt, model, audioData: null, error: null, ready: null };
+
+    if (model === 'system') {
+        // no preprocessing needed
+        item.ready = Promise.resolve();
+    } else {
+        // Failures are recorded on the item, so `ready` itself never rejects.
+        item.ready = fetchRemoteAudio(txt, model)
+            .then(data => { item.audioData = data; })
+            .catch(err => { item.error = err; });
+    }
+
+    speakingQueue.push(item);
+    if (!isSpeaking) processQueue();
+}
+
+/** Default API base URL for TTS provider `prov`. */
+function getModelUrl(prov) {
+    if (prov === 'openai') return gptTTSConfig.baseUrl;
+    if (prov === 'google') return geminiTTSConfig.baseUrl;
+    return 'https://api.openai.com/v1';
+}
+
+/**
+ * Request synthesized speech for `txt` from the provider named by `model`.
+ *
+ * @returns {Promise<*>} The result of the provider's `sendAudioRequest`;
+ *   the playback code decodes it as base64 audio.
+ * @throws {Error} If the provider is neither 'openai' nor 'google'.
+ */
+async function fetchRemoteAudio(txt, model) {
+    let prov, mdl, voice, url;
+    if (typeof model === 'string') {
+        [prov, mdl, voice] = model.split('/');
+        url = getModelUrl(prov);
+    } else {
+        prov = model.api;
+        mdl = model.model;
+        voice = model.voice;
+        url = model.url || getModelUrl(prov);
+    }
+
+    if (prov === 'openai') {
+        return gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
+    }
+    if (prov === 'google') {
+        return geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
+    }
+    throw new Error(`TTS Provider ${prov} is not supported.`);
+}
+
+/**
+ * Play queued items one at a time until the queue is empty.
+ *
+ * `isSpeaking` is held for the whole drain and released in `finally`, so a
+ * failing item can never leave the queue stuck.
+ */
+async function processQueue() {
+    if (isSpeaking) return;
+    isSpeaking = true;
+    try {
+        while (speakingQueue.length > 0) {
+            const item = speakingQueue.shift();
+            try {
+                await playItem(item);
+            } catch (err) {
+                console.error('[TTS] Audio error', err);
+            }
+        }
+    } finally {
+        isSpeaking = false;
+    }
+}
+
+/** Wait for an item's audio (if any) and play it to completion. */
+async function playItem(item) {
+    await item.ready;
+    if (item.error) {
+        console.error('[TTS] preprocess error', item.error);
+        return;
+    }
+
+    if (item.model === 'system') {
+        await speakWithSystemVoice(item.text);
+        return;
+    }
+
+    if (!item.audioData) {
+        console.error('[TTS] No audio data ready');
+        return;
+    }
+    await playAudio(item.audioData);
+}
+
+/** Quote `value` as one POSIX shell word (single quotes, `'` escaped as `'\''`). */
+function shellQuote(value) {
+    return `'${value.replace(/'/g, "'\\''")}'`;
+}
+
+/**
+ * Speak `txt` with the operating system's TTS tool. Resolves when the
+ * command finishes; failures are logged, never thrown.
+ */
+function speakWithSystemVoice(txt) {
+    return new Promise(resolve => {
+        const done = err => {
+            if (err) console.error('TTS error', err);
+            resolve();
+        };
+
+        if (process.platform === 'win32') {
+            const env = { ...process.env, TTS_TEXT: txt };
+            exec(WINDOWS_TTS_COMMAND, { env }, done);
+        } else {
+            const tool = process.platform === 'darwin' ? 'say' : 'espeak';
+            exec(`${tool} ${shellQuote(txt)}`, done);
+        }
+    });
+}
+
+/**
+ * Resolve once `child` has exited or failed to start; never rejects.
+ *
+ * Node may emit both 'error' and 'exit' for the same child. Settling a
+ * promise twice is a no-op, so the queue still advances exactly once.
+ */
+function waitForChild(child) {
+    return new Promise(resolve => {
+        child.once('error', err => {
+            console.error('[TTS] ffplay error', err);
+            resolve();
+        });
+        child.once('exit', () => resolve());
+    });
+}
+
+/** Play base64-encoded audio through ffplay and wait for it to finish. */
+async function playAudio(audioData) {
+    const audio = Buffer.from(audioData, 'base64');
+
+    if (process.platform === 'win32') {
+        // On Windows ffplay is given a temp file instead of a stdin pipe.
+        const tmpPath = path.join(os.tmpdir(), `tts_${Date.now()}.mp3`);
+        try {
+            await fs.writeFile(tmpPath, audio);
+            const player = spawn('ffplay', ['-nodisp', '-autoexit', '-loglevel', 'quiet', tmpPath], {
+                stdio: 'ignore', windowsHide: true
+            });
+            await waitForChild(player);
+        } finally {
+            // Best-effort cleanup; the file may not exist if the write failed.
+            await fs.unlink(tmpPath).catch(() => {});
+        }
+        return;
+    }
+
+    const player = spawn('ffplay', ['-nodisp', '-autoexit', 'pipe:0'], {
+        stdio: ['pipe', 'ignore', 'ignore']
+    });
+    const finished = waitForChild(player);
+    // Without a listener, a failed write to a dead player would be thrown as
+    // an unhandled 'error' event.
+    player.stdin.on('error', err => console.error('[TTS] ffplay stdin error', err));
+    player.stdin.end(audio);
+    await finished;
+}
diff --git a/src/models/gpt.js b/src/models/gpt.js
index 142a766..ddf2d95 100644
--- a/src/models/gpt.js
+++ b/src/models/gpt.js
@@ -98,8 +98,6 @@ const sendAudioRequest = async (text, model, voice, url) => {
         input: text
     }
 
-    let audioData = null;
-
     let config = {};
     if (url)