From 33183df327764eaa7ec82df41301960016f417af Mon Sep 17 00:00:00 2001 From: Sweaterdog Date: Fri, 14 Mar 2025 13:46:33 -0700 Subject: [PATCH] Update tts_process.js Added more logic to prevent multiple API requests, gibberish being sent, as well as made it easier to talk to the agent. --- src/process/tts_process.js | 171 +++++++++++++++++++++++++++---------- 1 file changed, 125 insertions(+), 46 deletions(-) diff --git a/src/process/tts_process.js b/src/process/tts_process.js index 977d783..5d20259 100644 --- a/src/process/tts_process.js +++ b/src/process/tts_process.js @@ -26,56 +26,72 @@ for (const file of leftover) { } // Configuration -const RMS_THRESHOLD = 500; // Lower threshold for faint audio -const SILENCE_DURATION = 2000; // 2 seconds of silence after speech => stop +const RMS_THRESHOLD = 500; // Lower threshold for faint audio +const SILENCE_DURATION = 2000; // 2 seconds of silence after speech => stop const SAMPLE_RATE = 16000; const BIT_DEPTH = 16; -const STT_USERNAME = settings.stt_username || "SERVER"; // Name that appears as sender -const STT_AGENT_NAME = settings.stt_agent_name || ""; // If blank, broadcast to all +const STT_USERNAME = settings.stt_username || "SERVER"; // Name that appears as sender +const STT_AGENT_NAME = settings.stt_agent_name || ""; // If blank, broadcast to all + +// Guards to prevent multiple overlapping recordings +let isRecording = false; // Ensures only one recordAndTranscribeOnce at a time +let sttRunning = false; // Ensures continuousLoop is started only once /** * Records one session, transcribes, and sends to MindServer as a chat message */ -function recordAndTranscribeOnce() { - return new Promise((resolve, reject) => { - const outFile = path.join(__dirname, `speech_${Date.now()}.wav`); - const fileWriter = new wav.FileWriter(outFile, { - channels: 1, +async function recordAndTranscribeOnce() { + // If another recording is in progress, just skip + if (isRecording) { + console.log("Another recording is still in progress; skipping new record attempt."); + return null; + } + isRecording = true; + + const outFile = path.join(__dirname, `speech_${Date.now()}.wav`); + const fileWriter = new wav.FileWriter(outFile, { + channels: 1, + sampleRate: SAMPLE_RATE, + bitDepth: BIT_DEPTH + }); + const ai = new AudioIO({ + inOptions: { + channelCount: 1, + sampleFormat: SampleFormat16Bit, sampleRate: SAMPLE_RATE, - bitDepth: BIT_DEPTH - }); - const ai = new AudioIO({ - inOptions: { - channelCount: 1, - sampleFormat: SampleFormat16Bit, - sampleRate: SAMPLE_RATE, - deviceId: -1, - closeOnError: true - } - }); - - let recording = true; - let hasHeardSpeech = false; - let silenceTimer = null; - - function resetSilenceTimer() { - if (silenceTimer) clearTimeout(silenceTimer); - if (hasHeardSpeech) { - silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION); - } + deviceId: -1, + closeOnError: true } + }); - function stopRecording() { - if (!recording) return; - recording = false; - ai.quit(); - fileWriter.end(); + let recording = true; + let hasHeardSpeech = false; + let silenceTimer = null; + let finished = false; // Guard to ensure final processing is done only once + + // Helper to reset silence timer + function resetSilenceTimer() { + if (silenceTimer) clearTimeout(silenceTimer); + if (hasHeardSpeech) { + silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION); } + } + // Stop recording + function stopRecording() { + if (!recording) return; + recording = false; + ai.quit(); + fileWriter.end(); + } + + // We wrap everything in a promise so we can await the transcription + return new Promise((resolve, reject) => { + // Attach event handlers ai.on('data', (chunk) => { fileWriter.write(chunk); - // Calculate RMS + // Calculate RMS for threshold detection let sumSquares = 0; const sampleCount = chunk.length / 2; for (let i = 0; i < chunk.length; i += 2) { @@ -84,6 +100,7 @@ function recordAndTranscribeOnce() { } const rms = Math.sqrt(sumSquares / sampleCount); + // If RMS passes threshold, we've heard speech if (rms > RMS_THRESHOLD) { if (!hasHeardSpeech) { hasHeardSpeech = true; @@ -93,12 +110,27 @@ function recordAndTranscribeOnce() { }); ai.on('error', (err) => { + cleanupListeners(); reject(err); }); - // Once the WAV file is finalized, transcribe fileWriter.on('finish', async () => { + if (finished) return; + finished = true; try { + // Check audio duration + const stats = fs.statSync(outFile); + const headerSize = 44; // standard WAV header size + const dataSize = stats.size - headerSize; + const duration = dataSize / (SAMPLE_RATE * (BIT_DEPTH / 8)); + if (duration < 2.75) { + console.log("Audio too short (<2.75s); discarding."); + fs.unlink(outFile, () => {}); + cleanupListeners(); + return resolve(null); + } + + // Transcribe const groqTTS = new GroqCloudTTS(); const text = await groqTTS.transcribe(outFile, { model: "distil-whisper-large-v3-en", @@ -108,17 +140,45 @@ function recordAndTranscribeOnce() { temperature: 0.0 }); - fs.unlink(outFile, () => {}); // Clean up wav file + fs.unlink(outFile, () => {}); // cleanup WAV file - // If Whisper returned nothing or just whitespace, discard + // Basic check for empty or whitespace if (!text || !text.trim()) { - console.log("Transcription empty, discarding."); + console.log("Transcription empty; discarding."); + cleanupListeners(); + return resolve(null); + } + + // Heuristic checks to determine if the transcription is genuine + + // 1. Ensure at least one alphabetical character + if (!/[A-Za-z]/.test(text)) { + console.log("Transcription has no letters; discarding."); + cleanupListeners(); + return resolve(null); + } + + // 2. Check for gibberish repeated sequences + if (/([A-Za-z])\1{3,}/.test(text)) { + console.log("Transcription looks like gibberish; discarding."); + cleanupListeners(); + return resolve(null); + } + + // 3. Check transcription length, with allowed greetings + const letterCount = text.replace(/[^A-Za-z]/g, "").length; + const normalizedText = text.trim().toLowerCase(); + const allowedGreetings = new Set(["hi", "hello", "greetings", "hey"]); + + if (letterCount < 8 && !allowedGreetings.has(normalizedText)) { + console.log("Transcription too short and not an allowed greeting; discarding."); + cleanupListeners(); return resolve(null); } console.log("Transcription:", text); - // Format message so it looks like: "[SERVER] hello there" + // Format message so it looks like: "[SERVER] message" const finalMessage = `[${STT_USERNAME}] ${text}`; // If STT_AGENT_NAME is empty, broadcast to all agents @@ -132,18 +192,30 @@ function recordAndTranscribeOnce() { getIO().emit('send-message', STT_AGENT_NAME, finalMessage); } + cleanupListeners(); resolve(text); } catch (err) { + cleanupListeners(); reject(err); } }); ai.start(); + + function cleanupListeners() { + ai.removeAllListeners('data'); + ai.removeAllListeners('error'); + fileWriter.removeAllListeners('finish'); + if (silenceTimer) clearTimeout(silenceTimer); + + // release lock + isRecording = false; + } }); } /** - * Runs recording sessions sequentially so only one at a time + * Runs recording sessions sequentially, so only one at a time */ async function continuousLoop() { while (true) { @@ -157,12 +229,19 @@ async function continuousLoop() { } } -/** - * Initialize STT if enabled - */ export function initTTS() { + // Only run if stt_transcription is true and we haven't started already if (!settings.stt_transcription) return; - continuousLoop().catch(() => {}); + + if (sttRunning) { + console.log("STT loop already running; skipping re-init."); + return; + } + sttRunning = true; + + continuousLoop().catch((err) => { + console.error("[STT] continuousLoop crashed", err); + }); } initTTS();