working on windows, better queuing, spacing formatting, say->speak

This commit is contained in:
MaxRobinsonTheGreat 2025-08-27 15:04:45 -05:00
parent d489cae49d
commit 684dcb701a
3 changed files with 139 additions and 98 deletions

View file

@ -15,7 +15,7 @@ import { addBrowserViewer } from './vision/browser_viewer.js';
import { serverProxy, sendOutputToServer } from './mindserver_proxy.js';
import settings from './settings.js';
import { Task } from './tasks/tasks.js';
import { say } from './speak.js';
import { speak } from './speak.js';
export class Agent {
async start(load_mem=false, init_message=null, count_id=0) {
@ -384,7 +384,7 @@ export class Agent {
}
else {
if (settings.speak) {
say(to_translate, this.prompter.profile.speak_model);
speak(to_translate, this.prompter.profile.speak_model);
}
if (settings.chat_ingame) {this.bot.chat(message);}
sendOutputToServer(this.name, message);

View file

@ -1,26 +1,86 @@
import { exec, spawn } from 'child_process';
import { promises as fs } from 'fs';
import os from 'os';
import path from 'path';
import { TTSConfig as gptTTSConfig } from '../models/gpt.js';
import { TTSConfig as geminiTTSConfig } from '../models/gemini.js';
let speakingQueue = [];
let speakingQueue = []; // each item: {text, model, audioData, ready}
let isSpeaking = false;
export function say(text, speak_model) {
speakingQueue.push([text, speak_model]);
/**
 * Queue a line of text for text-to-speech playback.
 * @param {string} text - The text to be spoken.
 * @param {string|object} speak_model - Either "provider/model/voice" or a
 *     config object ({api, model, voice, url}); falsy means local system TTS.
 */
export function speak(text, speak_model) {
    const model = speak_model || 'system';
    const entry = { text, model, audioData: null, ready: null };
    // System TTS is played locally and needs no prefetch; remote models start
    // fetching their audio immediately so playback isn't delayed at dequeue
    // time. Fetch failures are captured on the entry for the queue to report.
    entry.ready = model === 'system'
        ? Promise.resolve()
        : fetchRemoteAudio(text, model)
            .then((data) => { entry.audioData = data; })
            .catch((err) => { entry.error = err; });
    speakingQueue.push(entry);
    if (!isSpeaking) processQueue();
}
/**
 * Fetch synthesized audio for `txt` from a remote TTS provider.
 * @param {string} txt - Text to synthesize.
 * @param {string|object} model - "provider/model/voice" string or a config
 *     object ({api, model, voice, url}).
 * @returns {Promise<string>} Base64-encoded audio data from the provider.
 * @throws {Error} If the provider is neither "openai" nor "google".
 */
async function fetchRemoteAudio(txt, model) {
    // Resolve a provider name to its TTS endpoint; unknown providers fall
    // back to the OpenAI-compatible default URL.
    const resolveUrl = (prov) =>
        prov === 'openai' ? gptTTSConfig.baseUrl
        : prov === 'google' ? geminiTTSConfig.baseUrl
        : 'https://api.openai.com/v1';

    let provider;
    let modelName;
    let voice;
    let url;
    if (typeof model === 'string') {
        // Compact "provider/model/voice" form.
        [provider, modelName, voice] = model.split('/');
        url = resolveUrl(provider);
    } else {
        // Expanded config-object form; an explicit url wins over the default.
        provider = model.api;
        modelName = model.model;
        voice = model.voice;
        url = model.url || resolveUrl(provider);
    }

    switch (provider) {
        case 'openai':
            return gptTTSConfig.sendAudioRequest(txt, modelName, voice, url);
        case 'google':
            return geminiTTSConfig.sendAudioRequest(txt, modelName, voice, url);
        default:
            throw new Error(`TTS Provider ${provider} is not supported.`);
    }
}
async function processQueue() {
isSpeaking = true;
if (speakingQueue.length === 0) {
isSpeaking = false;
return;
}
isSpeaking = true;
const [txt, speak_model] = speakingQueue.shift();
const item = speakingQueue.shift();
const { text: txt, model, audioData } = item;
if (txt.trim() === '') {
isSpeaking = false;
processQueue();
return;
}
const isWin = process.platform === 'win32';
const isMac = process.platform === 'darwin';
const model = speak_model || 'openai/tts-1/echo';
// wait for preprocessing if needed
try {
await item.ready;
if (item.error) throw item.error;
} catch (err) {
console.error('[TTS] preprocess error', err);
isSpeaking = false;
processQueue();
return;
}
if (model === 'system') {
// system TTS
@ -34,61 +94,41 @@ $s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
exec(cmd, err => {
if (err) console.error('TTS error', err);
isSpeaking = false;
processQueue();
});
} else {
function getModelUrl(prov) {
if (prov === 'openai') {
return gptTTSConfig.baseUrl;
} else if (prov === 'google') {
return geminiTTSConfig.baseUrl;
} else {
// fallback
return 'https://api.openai.com/v1'
}
}
else {
// audioData was already fetched in speak()
const audioData = item.audioData;
// remote audio provider
let prov, mdl, voice, url;
if (typeof model === "string") {
[prov, mdl, voice] = model.split('/');
url = getModelUrl(prov);
} else {
prov = model.api;
mdl = model.model;
voice = model.voice;
url = model.url || getModelUrl(prov);
if (!audioData) {
console.error('[TTS] No audio data ready');
isSpeaking = false;
processQueue();
return;
}
try {
let audioData;
if (prov === "openai") {
audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
} else if (prov === "google") {
audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
} else {
throw new Error(`TTS Provider ${prov} is not supported.`);
}
if (!audioData) {
throw new Error("TTS model did not return audio data");
// will be handled below
}
if (isWin) {
const ps = `
Add-Type -AssemblyName presentationCore;
$p=New-Object System.Windows.Media.MediaPlayer;
$p.Open([Uri]::new("data:audio/mp3;base64,${audioData}"));
$p.Play();
Start-Sleep -Seconds [math]::Ceiling($p.NaturalDuration.TimeSpan.TotalSeconds);
`;
spawn('powershell', ['-NoProfile','-Command', ps], {
stdio: 'ignore', detached: true
}).unref();
const tmpPath = path.join(os.tmpdir(), `tts_${Date.now()}.mp3`);
await fs.writeFile(tmpPath, Buffer.from(audioData, 'base64'));
const player = spawn('ffplay', ['-nodisp', '-autoexit', '-loglevel', 'quiet', tmpPath], {
stdio: 'ignore', windowsHide: true
});
player.on('error', async (err) => {
console.error('[TTS] ffplay error', err);
try { await fs.unlink(tmpPath); } catch {}
isSpeaking = false;
processQueue();
});
player.on('exit', async () => {
try { await fs.unlink(tmpPath); } catch {}
isSpeaking = false;
processQueue();
});
} else {
const player = spawn('ffplay', ['-nodisp','-autoexit','pipe:0'], {
@ -96,11 +136,14 @@ $s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
});
player.stdin.write(Buffer.from(audioData, 'base64'));
player.stdin.end();
player.on('exit', processQueue);
player.on('exit', () => {
isSpeaking = false;
processQueue();
});
}
} catch (e) {
console.error('[TTS] Audio error', e);
isSpeaking = false;
processQueue();
}
}

View file

@ -98,8 +98,6 @@ const sendAudioRequest = async (text, model, voice, url) => {
input: text
}
let audioData = null;
let config = {};
if (url)