mirror of
https://github.com/kolbytn/mindcraft.git
synced 2025-08-31 11:23:08 +02:00
working on windows, better queuing, spacing formatting, say->speak
This commit is contained in:
parent
d489cae49d
commit
684dcb701a
3 changed files with 139 additions and 98 deletions
|
@ -15,7 +15,7 @@ import { addBrowserViewer } from './vision/browser_viewer.js';
|
|||
import { serverProxy, sendOutputToServer } from './mindserver_proxy.js';
|
||||
import settings from './settings.js';
|
||||
import { Task } from './tasks/tasks.js';
|
||||
import { say } from './speak.js';
|
||||
import { speak } from './speak.js';
|
||||
|
||||
export class Agent {
|
||||
async start(load_mem=false, init_message=null, count_id=0) {
|
||||
|
@ -384,7 +384,7 @@ export class Agent {
|
|||
}
|
||||
else {
|
||||
if (settings.speak) {
|
||||
say(to_translate, this.prompter.profile.speak_model);
|
||||
speak(to_translate, this.prompter.profile.speak_model);
|
||||
}
|
||||
if (settings.chat_ingame) {this.bot.chat(message);}
|
||||
sendOutputToServer(this.name, message);
|
||||
|
|
|
@ -1,26 +1,86 @@
|
|||
import { exec, spawn } from 'child_process';
|
||||
import { promises as fs } from 'fs';
|
||||
import os from 'os';
|
||||
import path from 'path';
|
||||
import { TTSConfig as gptTTSConfig } from '../models/gpt.js';
|
||||
import { TTSConfig as geminiTTSConfig } from '../models/gemini.js';
|
||||
|
||||
let speakingQueue = [];
|
||||
let speakingQueue = []; // each item: {text, model, audioData, ready}
|
||||
let isSpeaking = false;
|
||||
|
||||
export function say(text, speak_model) {
|
||||
speakingQueue.push([text, speak_model]);
|
||||
/**
 * Queue `text` for text-to-speech playback.
 *
 * For any model other than 'system', the remote audio request is started
 * immediately so it can overlap with playback of earlier queue items; the
 * queue consumer awaits `item.ready` before using `item.audioData`.
 *
 * @param {string} text - Text to speak. Blank text is ignored; non-string
 *   values are converted with String().
 * @param {string|object} [speak_model] - Model spec handed to
 *   fetchRemoteAudio; a falsy value selects 'system'.
 * @returns {void}
 */
export function speak(text, speak_model) {
    // Normalise up front: the queue consumer calls .trim() on the text, and a
    // throw there would leave isSpeaking stuck at true and stall the queue.
    const spoken = typeof text === 'string' ? text : (text == null ? '' : String(text));

    // Blank text is dropped by the queue consumer anyway; returning here also
    // avoids starting a remote audio request whose result would be discarded.
    if (spoken.trim() === '') return;

    const model = speak_model || 'system';
    const item = { text: spoken, model, audioData: null, ready: null, error: null };

    if (model === 'system') {
        // no preprocessing needed
        item.ready = Promise.resolve();
    } else {
        // Failures are recorded on the item rather than rejected, so `ready`
        // never produces an unhandled rejection while the item waits its turn.
        item.ready = fetchRemoteAudio(spoken, model)
            .then(data => { item.audioData = data; })
            .catch(err => { item.error = err; });
    }

    speakingQueue.push(item);
    if (!isSpeaking) {
        // processQueue is async; do not leave its promise floating. Clearing
        // the flag lets a later speak() call restart the queue.
        processQueue().catch(err => {
            console.error('[TTS] queue error', err);
            isSpeaking = false;
        });
    }
}
|
||||
|
||||
/**
 * Request synthesized audio for `txt` from a remote TTS provider.
 *
 * @param {string} txt - Text to synthesize.
 * @param {string|object} model - Either a "provider/model/voice" string or an
 *   object with `api`, `model`, `voice` and an optional `url` that overrides
 *   the provider's base URL.
 * @returns {Promise<*>} Whatever the provider's `sendAudioRequest` resolves
 *   to (presumably base64-encoded audio, given how the queue consumer decodes
 *   it; confirm against the provider modules).
 * @throws {Error} If `model` is neither a string nor an object, or if the
 *   provider is not 'openai' or 'google'. Because this is an async function,
 *   these surface as promise rejections.
 */
async function fetchRemoteAudio(txt, model) {
    // Resolved lazily so only the selected provider's config is touched.
    const providerConfig = (prov) => {
        if (prov === 'openai') return gptTTSConfig;
        if (prov === 'google') return geminiTTSConfig;
        return null;
    };

    let prov, mdl, voice, customUrl;
    if (typeof model === 'string') {
        [prov, mdl, voice] = model.split('/');
    } else if (model !== null && typeof model === 'object') {
        ({ api: prov, model: mdl, voice, url: customUrl } = model);
    } else {
        // Fail with a clear message instead of a TypeError from `model.api`.
        const got = model === null ? 'null' : typeof model;
        throw new Error(`TTS model must be a string or an object, got ${got}.`);
    }

    // Validate the provider before resolving a URL; a fallback URL for an
    // unknown provider would never be used.
    const config = providerConfig(prov);
    if (!config) {
        throw new Error(`TTS Provider ${prov} is not supported.`);
    }

    return config.sendAudioRequest(txt, mdl, voice, customUrl || config.baseUrl);
}
|
||||
|
||||
async function processQueue() {
|
||||
isSpeaking = true;
|
||||
if (speakingQueue.length === 0) {
|
||||
isSpeaking = false;
|
||||
return;
|
||||
}
|
||||
isSpeaking = true;
|
||||
const [txt, speak_model] = speakingQueue.shift();
|
||||
const item = speakingQueue.shift();
|
||||
const { text: txt, model, audioData } = item;
|
||||
if (txt.trim() === '') {
|
||||
isSpeaking = false;
|
||||
processQueue();
|
||||
return;
|
||||
}
|
||||
|
||||
const isWin = process.platform === 'win32';
|
||||
const isMac = process.platform === 'darwin';
|
||||
const model = speak_model || 'openai/tts-1/echo';
|
||||
|
||||
// wait for preprocessing if needed
|
||||
try {
|
||||
await item.ready;
|
||||
if (item.error) throw item.error;
|
||||
} catch (err) {
|
||||
console.error('[TTS] preprocess error', err);
|
||||
isSpeaking = false;
|
||||
processQueue();
|
||||
return;
|
||||
}
|
||||
|
||||
if (model === 'system') {
|
||||
// system TTS
|
||||
|
@ -34,61 +94,41 @@ $s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
|
|||
|
||||
exec(cmd, err => {
|
||||
if (err) console.error('TTS error', err);
|
||||
isSpeaking = false;
|
||||
processQueue();
|
||||
});
|
||||
|
||||
} else {
|
||||
|
||||
function getModelUrl(prov) {
|
||||
if (prov === 'openai') {
|
||||
return gptTTSConfig.baseUrl;
|
||||
} else if (prov === 'google') {
|
||||
return geminiTTSConfig.baseUrl;
|
||||
} else {
|
||||
// fallback
|
||||
return 'https://api.openai.com/v1'
|
||||
}
|
||||
}
|
||||
else {
|
||||
// audioData was already fetched in speak()
|
||||
const audioData = item.audioData;
|
||||
|
||||
// remote audio provider
|
||||
let prov, mdl, voice, url;
|
||||
if (typeof model === "string") {
|
||||
[prov, mdl, voice] = model.split('/');
|
||||
url = getModelUrl(prov);
|
||||
} else {
|
||||
prov = model.api;
|
||||
mdl = model.model;
|
||||
voice = model.voice;
|
||||
url = model.url || getModelUrl(prov);
|
||||
if (!audioData) {
|
||||
console.error('[TTS] No audio data ready');
|
||||
isSpeaking = false;
|
||||
processQueue();
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
let audioData;
|
||||
if (prov === "openai") {
|
||||
audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
|
||||
} else if (prov === "google") {
|
||||
audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
|
||||
} else {
|
||||
throw new Error(`TTS Provider ${prov} is not supported.`);
|
||||
}
|
||||
|
||||
if (!audioData) {
|
||||
throw new Error("TTS model did not return audio data");
|
||||
// will be handled below
|
||||
}
|
||||
|
||||
if (isWin) {
|
||||
const ps = `
|
||||
Add-Type -AssemblyName presentationCore;
|
||||
$p=New-Object System.Windows.Media.MediaPlayer;
|
||||
$p.Open([Uri]::new("data:audio/mp3;base64,${audioData}"));
|
||||
$p.Play();
|
||||
Start-Sleep -Seconds [math]::Ceiling($p.NaturalDuration.TimeSpan.TotalSeconds);
|
||||
`;
|
||||
spawn('powershell', ['-NoProfile','-Command', ps], {
|
||||
stdio: 'ignore', detached: true
|
||||
}).unref();
|
||||
const tmpPath = path.join(os.tmpdir(), `tts_${Date.now()}.mp3`);
|
||||
await fs.writeFile(tmpPath, Buffer.from(audioData, 'base64'));
|
||||
|
||||
const player = spawn('ffplay', ['-nodisp', '-autoexit', '-loglevel', 'quiet', tmpPath], {
|
||||
stdio: 'ignore', windowsHide: true
|
||||
});
|
||||
player.on('error', async (err) => {
|
||||
console.error('[TTS] ffplay error', err);
|
||||
try { await fs.unlink(tmpPath); } catch {}
|
||||
isSpeaking = false;
|
||||
processQueue();
|
||||
});
|
||||
player.on('exit', async () => {
|
||||
try { await fs.unlink(tmpPath); } catch {}
|
||||
isSpeaking = false;
|
||||
processQueue();
|
||||
});
|
||||
|
||||
} else {
|
||||
const player = spawn('ffplay', ['-nodisp','-autoexit','pipe:0'], {
|
||||
|
@ -96,11 +136,14 @@ $s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
|
|||
});
|
||||
player.stdin.write(Buffer.from(audioData, 'base64'));
|
||||
player.stdin.end();
|
||||
player.on('exit', processQueue);
|
||||
player.on('exit', () => {
|
||||
isSpeaking = false;
|
||||
processQueue();
|
||||
});
|
||||
}
|
||||
|
||||
} catch (e) {
|
||||
console.error('[TTS] Audio error', e);
|
||||
isSpeaking = false;
|
||||
processQueue();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -98,8 +98,6 @@ const sendAudioRequest = async (text, model, voice, url) => {
|
|||
input: text
|
||||
}
|
||||
|
||||
let audioData = null;
|
||||
|
||||
let config = {};
|
||||
|
||||
if (url)
|
||||
|
|
Loading…
Add table
Reference in a new issue