From 33183df327764eaa7ec82df41301960016f417af Mon Sep 17 00:00:00 2001
From: Sweaterdog <Sweaterdog5475@gmail.com>
Date: Fri, 14 Mar 2025 13:46:33 -0700
Subject: [PATCH] Update tts_process.js

Added logic to prevent overlapping API requests and to discard gibberish transcriptions, and made it easier to talk to the agent.
---
 src/process/tts_process.js | 171 +++++++++++++++++++++++++++----------
 1 file changed, 125 insertions(+), 46 deletions(-)

diff --git a/src/process/tts_process.js b/src/process/tts_process.js
index 977d783..5d20259 100644
--- a/src/process/tts_process.js
+++ b/src/process/tts_process.js
@@ -26,56 +26,72 @@ for (const file of leftover) {
 }
 
 // Configuration
-const RMS_THRESHOLD = 500;          // Lower threshold for faint audio
-const SILENCE_DURATION = 2000;      // 2 seconds of silence after speech => stop
+const RMS_THRESHOLD = 500;     // Lower threshold for faint audio
+const SILENCE_DURATION = 2000; // 2 seconds of silence after speech => stop
 const SAMPLE_RATE = 16000;
 const BIT_DEPTH = 16;
-const STT_USERNAME = settings.stt_username || "SERVER";        // Name that appears as sender
-const STT_AGENT_NAME = settings.stt_agent_name || "";          // If blank, broadcast to all
+const STT_USERNAME = settings.stt_username || "SERVER"; // Name that appears as sender
+const STT_AGENT_NAME = settings.stt_agent_name || "";   // If blank, broadcast to all
+
+// Guards to prevent multiple overlapping recordings
+let isRecording = false;  // Ensures only one recordAndTranscribeOnce at a time
+let sttRunning = false;   // Ensures continuousLoop is started only once
 
 /**
  * Records one session, transcribes, and sends to MindServer as a chat message
  */
-function recordAndTranscribeOnce() {
-  return new Promise((resolve, reject) => {
-    const outFile = path.join(__dirname, `speech_${Date.now()}.wav`);
-    const fileWriter = new wav.FileWriter(outFile, {
-      channels: 1,
+async function recordAndTranscribeOnce() {
+  // If another recording is in progress, just skip
+  if (isRecording) {
+    console.log("Another recording is still in progress; skipping new record attempt.");
+    return null;
+  }
+  isRecording = true;
+
+  const outFile = path.join(__dirname, `speech_${Date.now()}.wav`);
+  const fileWriter = new wav.FileWriter(outFile, {
+    channels: 1,
+    sampleRate: SAMPLE_RATE,
+    bitDepth: BIT_DEPTH
+  });
+  const ai = new AudioIO({
+    inOptions: {
+      channelCount: 1,
+      sampleFormat: SampleFormat16Bit,
       sampleRate: SAMPLE_RATE,
-      bitDepth: BIT_DEPTH
-    });
-    const ai = new AudioIO({
-      inOptions: {
-        channelCount: 1,
-        sampleFormat: SampleFormat16Bit,
-        sampleRate: SAMPLE_RATE,
-        deviceId: -1,
-        closeOnError: true
-      }
-    });
-
-    let recording = true;
-    let hasHeardSpeech = false;
-    let silenceTimer = null;
-
-    function resetSilenceTimer() {
-      if (silenceTimer) clearTimeout(silenceTimer);
-      if (hasHeardSpeech) {
-        silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION);
-      }
+      deviceId: -1,
+      closeOnError: true
     }
+  });
 
-    function stopRecording() {
-      if (!recording) return;
-      recording = false;
-      ai.quit();
-      fileWriter.end();
+  let recording = true;
+  let hasHeardSpeech = false;
+  let silenceTimer = null;
+  let finished = false; // Guard to ensure final processing is done only once
+
+  // Helper to reset silence timer
+  function resetSilenceTimer() {
+    if (silenceTimer) clearTimeout(silenceTimer);
+    if (hasHeardSpeech) {
+      silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION);
     }
+  }
 
+  // Stop recording
+  function stopRecording() {
+    if (!recording) return;
+    recording = false;
+    ai.quit();
+    fileWriter.end();
+  }
+
+  // We wrap everything in a promise so we can await the transcription
+  return new Promise((resolve, reject) => {
+    // Attach event handlers
     ai.on('data', (chunk) => {
       fileWriter.write(chunk);
 
-      // Calculate RMS
+      // Calculate RMS for threshold detection
       let sumSquares = 0;
       const sampleCount = chunk.length / 2;
       for (let i = 0; i < chunk.length; i += 2) {
@@ -84,6 +100,7 @@ function recordAndTranscribeOnce() {
       }
       const rms = Math.sqrt(sumSquares / sampleCount);
 
+      // If RMS passes threshold, we've heard speech
       if (rms > RMS_THRESHOLD) {
         if (!hasHeardSpeech) {
           hasHeardSpeech = true;
@@ -93,12 +110,27 @@ function recordAndTranscribeOnce() {
     });
 
     ai.on('error', (err) => {
+      cleanupListeners();
       reject(err);
     });
 
-    // Once the WAV file is finalized, transcribe
     fileWriter.on('finish', async () => {
+      if (finished) return;
+      finished = true;
       try {
+        // Check audio duration
+        const stats = fs.statSync(outFile);
+        const headerSize = 44; // standard WAV header size
+        const dataSize = stats.size - headerSize;
+        const duration = dataSize / (SAMPLE_RATE * (BIT_DEPTH / 8));
+        if (duration < 2.75) {
+          console.log("Audio too short (<2.75s); discarding.");
+          fs.unlink(outFile, () => {});
+          cleanupListeners();
+          return resolve(null);
+        }
+
+        // Transcribe
         const groqTTS = new GroqCloudTTS();
         const text = await groqTTS.transcribe(outFile, {
           model: "distil-whisper-large-v3-en",
@@ -108,17 +140,45 @@ function recordAndTranscribeOnce() {
           temperature: 0.0
         });
 
-        fs.unlink(outFile, () => {}); // Clean up wav file
+        fs.unlink(outFile, () => {}); // cleanup WAV file
 
-        // If Whisper returned nothing or just whitespace, discard
+        // Basic check for empty or whitespace
         if (!text || !text.trim()) {
-          console.log("Transcription empty, discarding.");
+          console.log("Transcription empty; discarding.");
+          cleanupListeners();
+          return resolve(null);
+        }
+
+        // Heuristic checks to determine if the transcription is genuine
+        
+        // 1. Ensure at least one alphabetical character
+        if (!/[A-Za-z]/.test(text)) {
+          console.log("Transcription has no letters; discarding.");
+          cleanupListeners();
+          return resolve(null);
+        }
+
+        // 2. Check for gibberish repeated sequences
+        if (/([A-Za-z])\1{3,}/.test(text)) {
+          console.log("Transcription looks like gibberish; discarding.");
+          cleanupListeners();
+          return resolve(null);
+        }
+
+        // 3. Check transcription length, with allowed greetings
+        const letterCount = text.replace(/[^A-Za-z]/g, "").length;
+        const normalizedText = text.trim().toLowerCase();
+        const allowedGreetings = new Set(["hi", "hello", "greetings", "hey"]);
+
+        if (letterCount < 8 && !allowedGreetings.has(normalizedText)) {
+          console.log("Transcription too short and not an allowed greeting; discarding.");
+          cleanupListeners();
           return resolve(null);
         }
 
         console.log("Transcription:", text);
 
-        // Format message so it looks like: "[SERVER] hello there"
+        // Format message so it looks like: "[SERVER] message"
         const finalMessage = `[${STT_USERNAME}] ${text}`;
 
         // If STT_AGENT_NAME is empty, broadcast to all agents
@@ -132,18 +192,30 @@ function recordAndTranscribeOnce() {
           getIO().emit('send-message', STT_AGENT_NAME, finalMessage);
         }
 
+        cleanupListeners();
         resolve(text);
       } catch (err) {
+        cleanupListeners();
         reject(err);
       }
     });
 
     ai.start();
+
+    function cleanupListeners() {
+      ai.removeAllListeners('data');
+      ai.removeAllListeners('error');
+      fileWriter.removeAllListeners('finish');
+      if (silenceTimer) clearTimeout(silenceTimer);
+
+      // release lock
+      isRecording = false;
+    }
   });
 }
 
 /**
- * Runs recording sessions sequentially so only one at a time
+ * Runs recording sessions sequentially, so only one is active at a time
  */
 async function continuousLoop() {
   while (true) {
@@ -157,12 +229,19 @@ async function continuousLoop() {
   }
 }
 
-/**
- * Initialize STT if enabled
- */
 export function initTTS() {
+  // Only run if stt_transcription is true and we haven't started already
   if (!settings.stt_transcription) return;
-  continuousLoop().catch(() => {});
+
+  if (sttRunning) {
+    console.log("STT loop already running; skipping re-init.");
+    return;
+  }
+  sttRunning = true;
+
+  continuousLoop().catch((err) => {
+    console.error("[STT] continuousLoop crashed", err);
+  });
 }
 
 initTTS();