mirror of
https://github.com/kolbytn/mindcraft.git
synced 2025-07-29 11:25:28 +02:00
Update tts_process.js
Added more logic to prevent multiple overlapping API requests and to stop gibberish transcriptions from being sent, and made it easier to talk to the agent.
This commit is contained in:
parent
64b284c0f2
commit
33183df327
1 changed file with 125 additions and 46 deletions
|
@ -26,56 +26,72 @@ for (const file of leftover) {
|
|||
}
|
||||
|
||||
// Configuration
|
||||
const RMS_THRESHOLD = 500; // Lower threshold for faint audio
|
||||
const SILENCE_DURATION = 2000; // 2 seconds of silence after speech => stop
|
||||
const RMS_THRESHOLD = 500; // Lower threshold for faint audio
|
||||
const SILENCE_DURATION = 2000; // 2 seconds of silence after speech => stop
|
||||
const SAMPLE_RATE = 16000;
|
||||
const BIT_DEPTH = 16;
|
||||
const STT_USERNAME = settings.stt_username || "SERVER"; // Name that appears as sender
|
||||
const STT_AGENT_NAME = settings.stt_agent_name || ""; // If blank, broadcast to all
|
||||
const STT_USERNAME = settings.stt_username || "SERVER"; // Name that appears as sender
|
||||
const STT_AGENT_NAME = settings.stt_agent_name || ""; // If blank, broadcast to all
|
||||
|
||||
// Guards to prevent multiple overlapping recordings
|
||||
let isRecording = false; // Ensures only one recordAndTranscribeOnce at a time
|
||||
let sttRunning = false; // Ensures continuousLoop is started only once
|
||||
|
||||
/**
|
||||
* Records one session, transcribes, and sends to MindServer as a chat message
|
||||
*/
|
||||
function recordAndTranscribeOnce() {
|
||||
return new Promise((resolve, reject) => {
|
||||
const outFile = path.join(__dirname, `speech_${Date.now()}.wav`);
|
||||
const fileWriter = new wav.FileWriter(outFile, {
|
||||
channels: 1,
|
||||
async function recordAndTranscribeOnce() {
|
||||
// If another recording is in progress, just skip
|
||||
if (isRecording) {
|
||||
console.log("Another recording is still in progress; skipping new record attempt.");
|
||||
return null;
|
||||
}
|
||||
isRecording = true;
|
||||
|
||||
const outFile = path.join(__dirname, `speech_${Date.now()}.wav`);
|
||||
const fileWriter = new wav.FileWriter(outFile, {
|
||||
channels: 1,
|
||||
sampleRate: SAMPLE_RATE,
|
||||
bitDepth: BIT_DEPTH
|
||||
});
|
||||
const ai = new AudioIO({
|
||||
inOptions: {
|
||||
channelCount: 1,
|
||||
sampleFormat: SampleFormat16Bit,
|
||||
sampleRate: SAMPLE_RATE,
|
||||
bitDepth: BIT_DEPTH
|
||||
});
|
||||
const ai = new AudioIO({
|
||||
inOptions: {
|
||||
channelCount: 1,
|
||||
sampleFormat: SampleFormat16Bit,
|
||||
sampleRate: SAMPLE_RATE,
|
||||
deviceId: -1,
|
||||
closeOnError: true
|
||||
}
|
||||
});
|
||||
|
||||
let recording = true;
|
||||
let hasHeardSpeech = false;
|
||||
let silenceTimer = null;
|
||||
|
||||
function resetSilenceTimer() {
|
||||
if (silenceTimer) clearTimeout(silenceTimer);
|
||||
if (hasHeardSpeech) {
|
||||
silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION);
|
||||
}
|
||||
deviceId: -1,
|
||||
closeOnError: true
|
||||
}
|
||||
});
|
||||
|
||||
function stopRecording() {
|
||||
if (!recording) return;
|
||||
recording = false;
|
||||
ai.quit();
|
||||
fileWriter.end();
|
||||
let recording = true;
|
||||
let hasHeardSpeech = false;
|
||||
let silenceTimer = null;
|
||||
let finished = false; // Guard to ensure final processing is done only once
|
||||
|
||||
// Helper to reset silence timer
|
||||
function resetSilenceTimer() {
|
||||
if (silenceTimer) clearTimeout(silenceTimer);
|
||||
if (hasHeardSpeech) {
|
||||
silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION);
|
||||
}
|
||||
}
|
||||
|
||||
// Stop recording
|
||||
function stopRecording() {
|
||||
if (!recording) return;
|
||||
recording = false;
|
||||
ai.quit();
|
||||
fileWriter.end();
|
||||
}
|
||||
|
||||
// We wrap everything in a promise so we can await the transcription
|
||||
return new Promise((resolve, reject) => {
|
||||
// Attach event handlers
|
||||
ai.on('data', (chunk) => {
|
||||
fileWriter.write(chunk);
|
||||
|
||||
// Calculate RMS
|
||||
// Calculate RMS for threshold detection
|
||||
let sumSquares = 0;
|
||||
const sampleCount = chunk.length / 2;
|
||||
for (let i = 0; i < chunk.length; i += 2) {
|
||||
|
@ -84,6 +100,7 @@ function recordAndTranscribeOnce() {
|
|||
}
|
||||
const rms = Math.sqrt(sumSquares / sampleCount);
|
||||
|
||||
// If RMS passes threshold, we've heard speech
|
||||
if (rms > RMS_THRESHOLD) {
|
||||
if (!hasHeardSpeech) {
|
||||
hasHeardSpeech = true;
|
||||
|
@ -93,12 +110,27 @@ function recordAndTranscribeOnce() {
|
|||
});
|
||||
|
||||
ai.on('error', (err) => {
|
||||
cleanupListeners();
|
||||
reject(err);
|
||||
});
|
||||
|
||||
// Once the WAV file is finalized, transcribe
|
||||
fileWriter.on('finish', async () => {
|
||||
if (finished) return;
|
||||
finished = true;
|
||||
try {
|
||||
// Check audio duration
|
||||
const stats = fs.statSync(outFile);
|
||||
const headerSize = 44; // standard WAV header size
|
||||
const dataSize = stats.size - headerSize;
|
||||
const duration = dataSize / (SAMPLE_RATE * (BIT_DEPTH / 8));
|
||||
if (duration < 2.75) {
|
||||
console.log("Audio too short (<2.75s); discarding.");
|
||||
fs.unlink(outFile, () => {});
|
||||
cleanupListeners();
|
||||
return resolve(null);
|
||||
}
|
||||
|
||||
// Transcribe
|
||||
const groqTTS = new GroqCloudTTS();
|
||||
const text = await groqTTS.transcribe(outFile, {
|
||||
model: "distil-whisper-large-v3-en",
|
||||
|
@ -108,17 +140,45 @@ function recordAndTranscribeOnce() {
|
|||
temperature: 0.0
|
||||
});
|
||||
|
||||
fs.unlink(outFile, () => {}); // Clean up wav file
|
||||
fs.unlink(outFile, () => {}); // cleanup WAV file
|
||||
|
||||
// If Whisper returned nothing or just whitespace, discard
|
||||
// Basic check for empty or whitespace
|
||||
if (!text || !text.trim()) {
|
||||
console.log("Transcription empty, discarding.");
|
||||
console.log("Transcription empty; discarding.");
|
||||
cleanupListeners();
|
||||
return resolve(null);
|
||||
}
|
||||
|
||||
// Heuristic checks to determine if the transcription is genuine
|
||||
|
||||
// 1. Ensure at least one alphabetical character
|
||||
if (!/[A-Za-z]/.test(text)) {
|
||||
console.log("Transcription has no letters; discarding.");
|
||||
cleanupListeners();
|
||||
return resolve(null);
|
||||
}
|
||||
|
||||
// 2. Check for gibberish repeated sequences
|
||||
if (/([A-Za-z])\1{3,}/.test(text)) {
|
||||
console.log("Transcription looks like gibberish; discarding.");
|
||||
cleanupListeners();
|
||||
return resolve(null);
|
||||
}
|
||||
|
||||
// 3. Check transcription length, with allowed greetings
|
||||
const letterCount = text.replace(/[^A-Za-z]/g, "").length;
|
||||
const normalizedText = text.trim().toLowerCase();
|
||||
const allowedGreetings = new Set(["hi", "hello", "greetings", "hey"]);
|
||||
|
||||
if (letterCount < 8 && !allowedGreetings.has(normalizedText)) {
|
||||
console.log("Transcription too short and not an allowed greeting; discarding.");
|
||||
cleanupListeners();
|
||||
return resolve(null);
|
||||
}
|
||||
|
||||
console.log("Transcription:", text);
|
||||
|
||||
// Format message so it looks like: "[SERVER] hello there"
|
||||
// Format message so it looks like: "[SERVER] message"
|
||||
const finalMessage = `[${STT_USERNAME}] ${text}`;
|
||||
|
||||
// If STT_AGENT_NAME is empty, broadcast to all agents
|
||||
|
@ -132,18 +192,30 @@ function recordAndTranscribeOnce() {
|
|||
getIO().emit('send-message', STT_AGENT_NAME, finalMessage);
|
||||
}
|
||||
|
||||
cleanupListeners();
|
||||
resolve(text);
|
||||
} catch (err) {
|
||||
cleanupListeners();
|
||||
reject(err);
|
||||
}
|
||||
});
|
||||
|
||||
ai.start();
|
||||
|
||||
function cleanupListeners() {
|
||||
ai.removeAllListeners('data');
|
||||
ai.removeAllListeners('error');
|
||||
fileWriter.removeAllListeners('finish');
|
||||
if (silenceTimer) clearTimeout(silenceTimer);
|
||||
|
||||
// release lock
|
||||
isRecording = false;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Runs recording sessions sequentially so only one at a time
|
||||
* Runs recording sessions sequentially, so only one at a time
|
||||
*/
|
||||
async function continuousLoop() {
|
||||
while (true) {
|
||||
|
@ -157,12 +229,19 @@ async function continuousLoop() {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize STT if enabled
|
||||
*/
|
||||
export function initTTS() {
|
||||
// Only run if stt_transcription is true and we haven't started already
|
||||
if (!settings.stt_transcription) return;
|
||||
continuousLoop().catch(() => {});
|
||||
|
||||
if (sttRunning) {
|
||||
console.log("STT loop already running; skipping re-init.");
|
||||
return;
|
||||
}
|
||||
sttRunning = true;
|
||||
|
||||
continuousLoop().catch((err) => {
|
||||
console.error("[STT] continuousLoop crashed", err);
|
||||
});
|
||||
}
|
||||
|
||||
initTTS();
|
||||
|
|
Loading…
Add table
Reference in a new issue