Merge pull request #7 from Sweaterdog/Speech-to-Text

Speech to text
Sweaterdog 2025-06-07 14:59:42 -07:00 committed by GitHub
commit 4efb5c304f
11 changed files with 335 additions and 33 deletions

README.md

@@ -120,6 +120,21 @@ When running in docker, if you want the bot to join your local minecraft server,
To connect to an unsupported minecraft version, you can try to use [viaproxy](services/viaproxy/README.md)
## STT in Mindcraft
STT allows you to speak to the model if you have a microphone.
STT can be enabled in `settings.js` under the section that looks like this:
```javascript
"stt_transcription": true, // Change this to "true" to enable STT
"stt_username": "SYSTEM",
"stt_agent_name": ""
```
The speech-to-text engine will begin listening on the system default input device.
When using STT, you **need** a [GroqCloud API key](https://console.groq.com/keys), as Groq is used for audio transcription.
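For reference, the key goes in `keys.json` at the project root (copied from `keys.example.json`, whose diff appears below); a minimal sketch with a placeholder value:
```json
{
    "GROQCLOUD_API_KEY": "your-groqcloud-api-key"
}
```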
# Bot Profiles
Bot profiles are json files (such as `andy.json`) that define:

keys.example.json

@@ -1,17 +1,17 @@
{
"OPENAI_API_KEY": "",
"OPENAI_ORG_ID": "",
"GEMINI_API_KEY": "",
"ANTHROPIC_API_KEY": "",
"REPLICATE_API_KEY": "",
"GROQCLOUD_API_KEY": "",
"HUGGINGFACE_API_KEY": "",
"QWEN_API_KEY": "",
"XAI_API_KEY": "",
"MISTRAL_API_KEY": "",
"DEEPSEEK_API_KEY": "",
"GHLF_API_KEY": "",
"HYPERBOLIC_API_KEY": "",
"NOVITA_API_KEY": "",
"OPENROUTER_API_KEY": ""
}

main.js

@@ -5,6 +5,7 @@ import { hideBin } from 'yargs/helpers';
import { createMindServer } from './src/server/mind_server.js';
import { mainProxy } from './src/process/main_proxy.js';
import { readFileSync } from 'fs';
import { initTTS } from './src/process/tts_process.js';
function parseArguments() {
return yargs(hideBin(process.argv))
@@ -39,7 +40,7 @@ async function main() {
const profiles = getProfiles(args);
console.log(profiles);
const { load_memory, init_message } = settings;
for (let i=0; i<profiles.length; i++) {
const agent_process = new AgentProcess();
const profile = readFileSync(profiles[i], 'utf8');
@@ -48,6 +49,7 @@ async function main() {
agent_process.start(profiles[i], load_memory, init_message, i, args.task_path, args.task_id);
await new Promise(resolve => setTimeout(resolve, 1000));
}
initTTS();
}
try {

package.json

@@ -9,7 +9,8 @@
"cheerio": "^1.0.0",
"express": "^4.18.2",
"google-translate-api-x": "^10.7.1",
"groq-sdk": "^0.15.0",
"groq-sdk": "^0.5.0",
"mic": "^2.1.2",
"minecraft-data": "^3.78.0",
"mineflayer": "^4.26.0",
"mineflayer-armor-manager": "^2.0.1",
@@ -17,6 +18,7 @@
"mineflayer-collectblock": "^1.4.1",
"mineflayer-pathfinder": "^2.4.5",
"mineflayer-pvp": "^1.3.2",
"naudiodon": "^2.3.6",
"node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
"openai": "^4.4.0",
"patch-package": "^8.0.0",
@@ -28,6 +30,7 @@
"socket.io-client": "^4.7.2",
"three": "^0.128.0",
"vec3": "^0.1.10",
"wav": "^1.0.2",
"yargs": "^17.7.2"
},
"scripts": {
@@ -40,4 +43,4 @@
"eslint-plugin-no-floating-promise": "^2.0.0",
"globals": "^15.11.0"
}
}


@@ -1,13 +1,12 @@
diff --git a/node_modules/@google/generative-ai/dist/index.mjs b/node_modules/@google/generative-ai/dist/index.mjs
index 23a175b..aab7e19 100644
--- a/node_modules/@google/generative-ai/dist/index.mjs
+++ b/node_modules/@google/generative-ai/dist/index.mjs
@@ -151,7 +151,7 @@ class GoogleGenerativeAIResponseError extends GoogleGenerativeAIError {
* limitations under the License.
*/
const BASE_URL = "https://generativelanguage.googleapis.com";
@@ -156,1 +156,1 @@
-const API_VERSION = "v1";
+const API_VERSION = "v1beta";
diff --git a/node_modules/@google/generative-ai/dist/index.js b/node_modules/@google/generative-ai/dist/index.js
--- a/node_modules/@google/generative-ai/dist/index.js
+++ b/node_modules/@google/generative-ai/dist/index.js
@@ -156,1 +156,1 @@
-const API_VERSION = "v1";
+const API_VERSION = "v1beta";
/**
* We can't `require` package.json if this runs on web. We will use rollup to
* swap in the version number here at build time.


@@ -7,4 +7,4 @@
"embedding": "openai"
}
}

settings.js

@@ -29,7 +29,6 @@ const settings = {
"load_memory": false, // load memory from previous session
"init_message": "Respond with hello world and your name", // sends to all on spawn
"only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly
"speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
"language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages
"show_bot_views": false, // show bot's view in browser at localhost:3000, 3001...
@@ -46,10 +45,15 @@ const settings = {
"verbose_commands": true, // show full command syntax
"narrate_behavior": true, // chat simple automatic actions ('Picking up item!')
"chat_bot_messages": true, // publicly chat messages to other bots
"stt_transcription": false, // change this to "true" or "false" depending on if you want STT in Mindcraft, STT needs a GroqCloud API key, can be found here: https://console.groq.com/keys
"stt_username": "SYSTEM", // Change this to the username the model will respond to.
"stt_agent_name": "" // Change the name here to whatever your agent is named, if left empty, will send message to all agents.
"speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
"log_normal_data": false,
"log_reasoning_data": false,
"log_vision_data": false,
"log_normal_data": false, // Logs all inputs / outputs without reasoning or vision data
"log_reasoning_data": false, // Logs only reasoning inputs / outputs
"log_vision_data": false, // Logs only vision inputs / outputs
}
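For example, to enable STT and route transcriptions to a single bot (using `andy`, from the `andy.json` profile mentioned above, as a hypothetical target), the new settings might look like this:
```javascript
"stt_transcription": true, // enable STT; requires a GroqCloud API key
"stt_username": "SYSTEM",  // transcriptions are sent under this username
"stt_agent_name": "andy"   // deliver only to the agent named "andy"
```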

src/agent/agent.js

@ -20,6 +20,15 @@ import { say } from './speak.js';
export class Agent {
async start(profile_fp, load_mem=false, init_message=null, count_id=0, task_path=null, task_id=null) {
this.last_sender = null;
// Safely attach agent instance to a global-like object so STT code can access it.
// This works in Node.js ESM or CommonJS. If "global" doesn't exist, fall back to "globalThis".
const globalObj = (typeof global !== 'undefined') ? global : globalThis;
try {
globalObj.agent = this;
} catch(e) {
console.warn("Failed attaching agent to global object:", e);
}
this.latestScreenshotPath = null;
this.count_id = count_id;
if (!profile_fp) {
@@ -126,6 +135,7 @@ export class Agent {
});
}
async _setupEventHandlers(save_data, init_message) {
const ignore_messages = [
"Set own game mode to",

src/models/groq.js

@@ -1,4 +1,5 @@
import Groq from 'groq-sdk';
import fs from "fs";
import { getKey } from '../utils/keys.js';
import { log, logVision } from '../../logger.js';
@@ -104,3 +105,21 @@ export class GroqCloudAPI {
throw new Error('Embeddings are not supported by Groq.');
}
}
export class GroqCloudTTS {
constructor() {
this.groq = new Groq({ apiKey: getKey('GROQCLOUD_API_KEY') });
}
async transcribe(filePath, options = {}) {
const transcription = await this.groq.audio.transcriptions.create({
file: fs.createReadStream(filePath),
model: options.model || "distil-whisper-large-v3-en", // or "whisper-large-v3-turbo"
prompt: options.prompt || "",
response_format: options.response_format || "json",
language: options.language || "en",
temperature: options.temperature !== undefined ? options.temperature : 0.0,
});
return transcription.text;
}
}
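A minimal usage sketch of the new class (the WAV path is hypothetical; assumes a valid `GROQCLOUD_API_KEY` in `keys.json` and an ESM context for top-level await):
```javascript
import { GroqCloudTTS } from './src/models/groq.js';

// Transcribe a prerecorded mono WAV file; defaults to the distil-whisper-large-v3-en model.
const stt = new GroqCloudTTS();
const text = await stt.transcribe('./recording.wav');
console.log(text);
```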

src/process/tts_process.js (new file)

@@ -0,0 +1,247 @@
import settings from '../../settings.js';
import { GroqCloudTTS } from '../models/groq.js';
import portAudio from 'naudiodon';
const { AudioIO, SampleFormat16Bit } = portAudio;
import wav from 'wav';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
// Import getIO and our new function getAllInGameAgentNames
import { getIO, getAllInGameAgentNames } from '../server/mind_server.js';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
* Delete leftover speech_*.wav from previous runs
*/
const leftover = fs.readdirSync(__dirname).filter(f => /^speech_\d+\.wav$/.test(f));
for (const file of leftover) {
try {
fs.unlinkSync(path.join(__dirname, file));
} catch (_) {
// ignore errors
}
}
// Configuration
const RMS_THRESHOLD = 500; // Lower threshold for faint audio
const SILENCE_DURATION = 2000; // 2 seconds of silence after speech => stop
const SAMPLE_RATE = 16000;
const BIT_DEPTH = 16;
const STT_USERNAME = settings.stt_username || "SERVER"; // Name that appears as sender
const STT_AGENT_NAME = settings.stt_agent_name || ""; // If blank, broadcast to all
// Guards to prevent multiple overlapping recordings
let isRecording = false; // Ensures only one recordAndTranscribeOnce at a time
let sttRunning = false; // Ensures continuousLoop is started only once
/**
* Records one session, transcribes, and sends to MindServer as a chat message
*/
async function recordAndTranscribeOnce() {
// If another recording is in progress, just skip
if (isRecording) {
console.log("Another recording is still in progress; skipping new record attempt.");
return null;
}
isRecording = true;
const outFile = path.join(__dirname, `speech_${Date.now()}.wav`);
const fileWriter = new wav.FileWriter(outFile, {
channels: 1,
sampleRate: SAMPLE_RATE,
bitDepth: BIT_DEPTH
});
const ai = new AudioIO({
inOptions: {
channelCount: 1,
sampleFormat: SampleFormat16Bit,
sampleRate: SAMPLE_RATE,
deviceId: -1,
closeOnError: true
}
});
let recording = true;
let hasHeardSpeech = false;
let silenceTimer = null;
let finished = false; // Guard to ensure final processing is done only once
// Helper to reset silence timer
function resetSilenceTimer() {
if (silenceTimer) clearTimeout(silenceTimer);
if (hasHeardSpeech) {
silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION);
}
}
// Stop recording
function stopRecording() {
if (!recording) return;
recording = false;
ai.quit();
fileWriter.end();
}
// We wrap everything in a promise so we can await the transcription
return new Promise((resolve, reject) => {
// Attach event handlers
ai.on('data', (chunk) => {
fileWriter.write(chunk);
// Calculate RMS for threshold detection
let sumSquares = 0;
const sampleCount = chunk.length / 2;
for (let i = 0; i < chunk.length; i += 2) {
const sample = chunk.readInt16LE(i);
sumSquares += sample * sample;
}
const rms = Math.sqrt(sumSquares / sampleCount);
// If RMS passes threshold, we've heard speech
if (rms > RMS_THRESHOLD) {
if (!hasHeardSpeech) {
hasHeardSpeech = true;
}
resetSilenceTimer();
}
});
ai.on('error', (err) => {
cleanupListeners();
reject(err);
});
fileWriter.on('finish', async () => {
if (finished) return;
finished = true;
try {
// Check audio duration
const stats = fs.statSync(outFile);
const headerSize = 44; // standard WAV header size
const dataSize = stats.size - headerSize;
const duration = dataSize / (SAMPLE_RATE * (BIT_DEPTH / 8));
if (duration < 2.75) {
console.log("Audio too short (<2.75s); discarding.");
fs.unlink(outFile, () => {});
cleanupListeners();
return resolve(null);
}
// Transcribe
const groqTTS = new GroqCloudTTS();
const text = await groqTTS.transcribe(outFile, {
model: "distil-whisper-large-v3-en",
prompt: "",
response_format: "json",
language: "en",
temperature: 0.0
});
fs.unlink(outFile, () => {}); // cleanup WAV file
// Basic check for empty or whitespace
if (!text || !text.trim()) {
console.log("Transcription empty; discarding.");
cleanupListeners();
return resolve(null);
}
// Heuristic checks to determine if the transcription is genuine
// 1. Ensure at least one alphabetical character
if (!/[A-Za-z]/.test(text)) {
console.log("Transcription has no letters; discarding.");
cleanupListeners();
return resolve(null);
}
// 2. Check for gibberish repeated sequences
if (/([A-Za-z])\1{3,}/.test(text)) {
console.log("Transcription looks like gibberish; discarding.");
cleanupListeners();
return resolve(null);
}
// 3. Check transcription length, with allowed greetings
const letterCount = text.replace(/[^A-Za-z]/g, "").length;
const normalizedText = text.trim().toLowerCase();
const allowedGreetings = new Set(["hi", "hello", "greetings", "hey"]);
if (letterCount < 8 && !allowedGreetings.has(normalizedText)) {
console.log("Transcription too short and not an allowed greeting; discarding.");
cleanupListeners();
return resolve(null);
}
console.log("Transcription:", text);
// Format message so it looks like: "[SERVER] message"
const finalMessage = `[${STT_USERNAME}] ${text}`;
// If STT_AGENT_NAME is empty, broadcast to all agents
if (!STT_AGENT_NAME.trim()) {
const agentNames = getAllInGameAgentNames(); // from mind_server
for (const agentName of agentNames) {
getIO().emit('send-message', agentName, finalMessage);
}
} else {
// Otherwise, send only to the specified agent
getIO().emit('send-message', STT_AGENT_NAME, finalMessage);
}
cleanupListeners();
resolve(text);
} catch (err) {
cleanupListeners();
reject(err);
}
});
ai.start();
function cleanupListeners() {
ai.removeAllListeners('data');
ai.removeAllListeners('error');
fileWriter.removeAllListeners('finish');
if (silenceTimer) clearTimeout(silenceTimer);
// release lock
isRecording = false;
}
});
}
/**
* Runs recording sessions sequentially, so only one at a time
*/
async function continuousLoop() {
while (true) {
try {
await recordAndTranscribeOnce();
} catch (err) {
console.error("[STT Error]", err);
}
// short gap
await new Promise(res => setTimeout(res, 1000));
}
}
export function initTTS() {
// Only run if stt_transcription is true and we haven't started already
if (!settings.stt_transcription) return;
if (sttRunning) {
console.log("STT loop already running; skipping re-init.");
return;
}
sttRunning = true;
continuousLoop().catch((err) => {
console.error("[STT] continuousLoop crashed", err);
});
}
initTTS();

src/server/mind_server.js

@@ -161,3 +161,6 @@ function stopAllAgents() {
export const getIO = () => io;
export const getServer = () => server;
export const getConnectedAgents = () => connectedAgents;
export function getAllInGameAgentNames() {
return Object.keys(inGameAgents);
}