diff --git a/README.md b/README.md index aa3945e..7f422ff 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,21 @@ When running in docker, if you want the bot to join your local minecraft server, To connect to an unsupported minecraft version, you can try to use [viaproxy](services/viaproxy/README.md) +## STT in Mindcraft + +STT allows you to speak to the model if you have a microphone + +STT can be enabled in `settings.js` under the section that looks like this: +```javascript + "stt_transcription": true, // Change this to "true" to enable STT + "stt_username": "SYSTEM", + "stt_agent_name": "" +``` + +The Speech to Text engine will begin listening on the system default input device. + +When using STT, you **need** a [GroqCloud API key](https://console.groq.com/keys) as Groq is used for audio transcription + # Bot Profiles Bot profiles are json files (such as `andy.json`) that define: diff --git a/keys.example.json b/keys.example.json index 99286c5..d9edf8b 100644 --- a/keys.example.json +++ b/keys.example.json @@ -1,17 +1,17 @@ -{ - "OPENAI_API_KEY": "", - "OPENAI_ORG_ID": "", - "GEMINI_API_KEY": "", - "ANTHROPIC_API_KEY": "", - "REPLICATE_API_KEY": "", - "GROQCLOUD_API_KEY": "", - "HUGGINGFACE_API_KEY": "", - "QWEN_API_KEY": "", - "XAI_API_KEY": "", - "MISTRAL_API_KEY": "", - "DEEPSEEK_API_KEY": "", - "GHLF_API_KEY": "", - "HYPERBOLIC_API_KEY": "", - "NOVITA_API_KEY": "", - "OPENROUTER_API_KEY": "" -} +{ + "OPENAI_API_KEY": "", + "OPENAI_ORG_ID": "", + "GEMINI_API_KEY": "", + "ANTHROPIC_API_KEY": "", + "REPLICATE_API_KEY": "", + "GROQCLOUD_API_KEY": "", + "HUGGINGFACE_API_KEY": "", + "QWEN_API_KEY": "", + "XAI_API_KEY": "", + "MISTRAL_API_KEY": "", + "DEEPSEEK_API_KEY": "", + "GHLF_API_KEY": "", + "HYPERBOLIC_API_KEY": "", + "NOVITA_API_KEY": "", + "OPENROUTER_API_KEY": "" +} diff --git a/main.js b/main.js index 521aadf..e5db05c 100644 --- a/main.js +++ b/main.js @@ -5,6 +5,7 @@ import { hideBin } from 'yargs/helpers'; import { createMindServer } from
'./src/server/mind_server.js'; import { mainProxy } from './src/process/main_proxy.js'; import { readFileSync } from 'fs'; +import { initTTS } from './src/process/tts_process.js'; function parseArguments() { return yargs(hideBin(process.argv)) @@ -39,7 +40,7 @@ async function main() { const profiles = getProfiles(args); console.log(profiles); const { load_memory, init_message } = settings; - + for (let i=0; i setTimeout(resolve, 1000)); } + initTTS(); } try { diff --git a/package.json b/package.json index bb3fd90..c713c92 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,8 @@ "cheerio": "^1.0.0", "express": "^4.18.2", "google-translate-api-x": "^10.7.1", - "groq-sdk": "^0.15.0", + "groq-sdk": "^0.5.0", + "mic": "^2.1.2", "minecraft-data": "^3.78.0", "mineflayer": "^4.26.0", "mineflayer-armor-manager": "^2.0.1", @@ -17,6 +18,7 @@ "mineflayer-collectblock": "^1.4.1", "mineflayer-pathfinder": "^2.4.5", "mineflayer-pvp": "^1.3.2", + "naudiodon": "^2.3.6", "node-canvas-webgl": "PrismarineJS/node-canvas-webgl", "openai": "^4.4.0", "patch-package": "^8.0.0", @@ -28,6 +30,7 @@ "socket.io-client": "^4.7.2", "three": "^0.128.0", "vec3": "^0.1.10", + "wav": "^1.0.2", "yargs": "^17.7.2" }, "scripts": { @@ -40,4 +43,4 @@ "eslint-plugin-no-floating-promise": "^2.0.0", "globals": "^15.11.0" } -} +} \ No newline at end of file diff --git a/patches/@google+generative-ai+0.2.1.patch b/patches/@google+generative-ai+0.2.1.patch index ebdff24..68d8ec6 100644 --- a/patches/@google+generative-ai+0.2.1.patch +++ b/patches/@google+generative-ai+0.2.1.patch @@ -1,13 +1,12 @@ diff --git a/node_modules/@google/generative-ai/dist/index.mjs b/node_modules/@google/generative-ai/dist/index.mjs -index 23a175b..aab7e19 100644 --- a/node_modules/@google/generative-ai/dist/index.mjs +++ b/node_modules/@google/generative-ai/dist/index.mjs -@@ -151,7 +151,7 @@ class GoogleGenerativeAIResponseError extends GoogleGenerativeAIError { - * limitations under the License. 
- */ - const BASE_URL = "https://generativelanguage.googleapis.com"; +@@ -156,1 +156,1 @@ +-const API_VERSION = "v1"; ++const API_VERSION = "v1beta"; +diff --git a/node_modules/@google/generative-ai/dist/index.js b/node_modules/@google/generative-ai/dist/index.js +--- a/node_modules/@google/generative-ai/dist/index.js ++++ b/node_modules/@google/generative-ai/dist/index.js +@@ -156,1 +156,1 @@ -const API_VERSION = "v1"; +const API_VERSION = "v1beta"; - /** - * We can't `require` package.json if this runs on web. We will use rollup to - * swap in the version number here at build time. diff --git a/profiles/llama.json b/profiles/llama.json index ceb3992..2e9cae0 100644 --- a/profiles/llama.json +++ b/profiles/llama.json @@ -7,4 +7,4 @@ "embedding": "openai" -} \ No newline at end of file +} diff --git a/settings.js b/settings.js index f6713ae..cdfc60e 100644 --- a/settings.js +++ b/settings.js @@ -29,7 +29,6 @@ const settings = { "load_memory": false, // load memory from previous session "init_message": "Respond with hello world and your name", // sends to all on spawn "only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly - "speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak` "language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages "show_bot_views": false, // show bot's view in browser at localhost:3000, 3001... 
@@ -46,10 +45,15 @@ const settings = { "verbose_commands": true, // show full command syntax "narrate_behavior": true, // chat simple automatic actions ('Picking up item!') "chat_bot_messages": true, // publicly chat messages to other bots + + "stt_transcription": false, // change this to "true" or "false" depending on if you want STT in Mindcraft, STT needs a GroqCloud API key, can be found here: https://console.groq.com/keys + "stt_username": "SYSTEM", // Change this to the username the model will respond to. + "stt_agent_name": "", // Change the name here to whatever your agent is named, if left empty, will send message to all agents. + "speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak` - "log_normal_data": false, - "log_reasoning_data": false, - "log_vision_data": false, + "log_normal_data": false, // Logs all inputs / outputs without reasoning or vision data + "log_reasoning_data": false, // Logs only reasoning inputs / outputs + "log_vision_data": false, // Logs only vision inputs / outputs } diff --git a/src/agent/agent.js b/src/agent/agent.js index 0f391e0..e58687d 100644 --- a/src/agent/agent.js +++ b/src/agent/agent.js @@ -20,6 +20,15 @@ import { say } from './speak.js'; export class Agent { async start(profile_fp, load_mem=false, init_message=null, count_id=0, task_path=null, task_id=null) { this.last_sender = null; + // Safely attach agent instance to a global-like object so STT code can access it. + // This works in Node.js ESM or CommonJS. If "global" doesn't exist, fallback to "globalThis". + const globalObj = (typeof global !== 'undefined') ?
global : globalThis; + try { + globalObj.agent = this; + } catch(e) { + console.warn("Failed attaching agent to global object:", e); + } + this.latestScreenshotPath = null; this.count_id = count_id; if (!profile_fp) { @@ -126,6 +135,7 @@ export class Agent { }); } + async _setupEventHandlers(save_data, init_message) { const ignore_messages = [ "Set own game mode to", diff --git a/src/models/groq.js b/src/models/groq.js index fefa8c7..de7ebbd 100644 --- a/src/models/groq.js +++ b/src/models/groq.js @@ -1,4 +1,5 @@ import Groq from 'groq-sdk' +import fs from "fs"; import { getKey } from '../utils/keys.js'; import { log, logVision } from '../../logger.js'; @@ -104,3 +105,21 @@ export class GroqCloudAPI { throw new Error('Embeddings are not supported by Groq.'); } } + +export class GroqCloudTTS { + constructor() { + this.groq = new Groq({ apiKey: getKey('GROQCLOUD_API_KEY') }); + } + + async transcribe(filePath, options = {}) { + const transcription = await this.groq.audio.transcriptions.create({ + file: fs.createReadStream(filePath), + model: options.model || "distil-whisper-large-v3-en", // or "whisper-large-v3-turbo" + prompt: options.prompt || "", + response_format: options.response_format || "json", + language: options.language || "en", + temperature: options.temperature !== undefined ? 
options.temperature : 0.0, + }); + return transcription.text; + } +} diff --git a/src/process/tts_process.js b/src/process/tts_process.js new file mode 100644 index 0000000..5d20259 --- /dev/null +++ b/src/process/tts_process.js @@ -0,0 +1,247 @@ +import settings from '../../settings.js'; +import { GroqCloudTTS } from '../models/groq.js'; +import portAudio from 'naudiodon'; +const { AudioIO, SampleFormat16Bit } = portAudio; +import wav from 'wav'; +import fs from 'fs'; +import path from 'path'; +import { fileURLToPath } from 'url'; + +// Import getIO and our new function getAllInGameAgentNames +import { getIO, getAllInGameAgentNames } from '../server/mind_server.js'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +/** + * Delete leftover speech_*.wav from previous runs + */ +const leftover = fs.readdirSync(__dirname).filter(f => /^speech_\d+\.wav$/.test(f)); +for (const file of leftover) { + try { + fs.unlinkSync(path.join(__dirname, file)); + } catch (_) { + // ignore errors + } +} + +// Configuration +const RMS_THRESHOLD = 500; // Lower threshold for faint audio +const SILENCE_DURATION = 2000; // 2 seconds of silence after speech => stop +const SAMPLE_RATE = 16000; +const BIT_DEPTH = 16; +const STT_USERNAME = settings.stt_username || "SERVER"; // Name that appears as sender +const STT_AGENT_NAME = settings.stt_agent_name || ""; // If blank, broadcast to all + +// Guards to prevent multiple overlapping recordings +let isRecording = false; // Ensures only one recordAndTranscribeOnce at a time +let sttRunning = false; // Ensures continuousLoop is started only once + +/** + * Records one session, transcribes, and sends to MindServer as a chat message + */ +async function recordAndTranscribeOnce() { + // If another recording is in progress, just skip + if (isRecording) { + console.log("Another recording is still in progress; skipping new record attempt."); + return null; + } + isRecording = true; + + const outFile 
= path.join(__dirname, `speech_${Date.now()}.wav`); + const fileWriter = new wav.FileWriter(outFile, { + channels: 1, + sampleRate: SAMPLE_RATE, + bitDepth: BIT_DEPTH + }); + const ai = new AudioIO({ + inOptions: { + channelCount: 1, + sampleFormat: SampleFormat16Bit, + sampleRate: SAMPLE_RATE, + deviceId: -1, + closeOnError: true + } + }); + + let recording = true; + let hasHeardSpeech = false; + let silenceTimer = null; + let finished = false; // Guard to ensure final processing is done only once + + // Helper to reset silence timer + function resetSilenceTimer() { + if (silenceTimer) clearTimeout(silenceTimer); + if (hasHeardSpeech) { + silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION); + } + } + + // Stop recording + function stopRecording() { + if (!recording) return; + recording = false; + ai.quit(); + fileWriter.end(); + } + + // We wrap everything in a promise so we can await the transcription + return new Promise((resolve, reject) => { + // Attach event handlers + ai.on('data', (chunk) => { + fileWriter.write(chunk); + + // Calculate RMS for threshold detection + let sumSquares = 0; + const sampleCount = chunk.length / 2; + for (let i = 0; i < chunk.length; i += 2) { + const sample = chunk.readInt16LE(i); + sumSquares += sample * sample; + } + const rms = Math.sqrt(sumSquares / sampleCount); + + // If RMS passes threshold, we've heard speech + if (rms > RMS_THRESHOLD) { + if (!hasHeardSpeech) { + hasHeardSpeech = true; + } + resetSilenceTimer(); + } + }); + + ai.on('error', (err) => { + cleanupListeners(); + reject(err); + }); + + fileWriter.on('finish', async () => { + if (finished) return; + finished = true; + try { + // Check audio duration + const stats = fs.statSync(outFile); + const headerSize = 44; // standard WAV header size + const dataSize = stats.size - headerSize; + const duration = dataSize / (SAMPLE_RATE * (BIT_DEPTH / 8)); + if (duration < 2.75) { + console.log("Audio too short (<2.75s); discarding."); + fs.unlink(outFile, 
() => {}); + cleanupListeners(); + return resolve(null); + } + + // Transcribe + const groqTTS = new GroqCloudTTS(); + const text = await groqTTS.transcribe(outFile, { + model: "distil-whisper-large-v3-en", + prompt: "", + response_format: "json", + language: "en", + temperature: 0.0 + }); + + fs.unlink(outFile, () => {}); // cleanup WAV file + + // Basic check for empty or whitespace + if (!text || !text.trim()) { + console.log("Transcription empty; discarding."); + cleanupListeners(); + return resolve(null); + } + + // Heuristic checks to determine if the transcription is genuine + + // 1. Ensure at least one alphabetical character + if (!/[A-Za-z]/.test(text)) { + console.log("Transcription has no letters; discarding."); + cleanupListeners(); + return resolve(null); + } + + // 2. Check for gibberish repeated sequences + if (/([A-Za-z])\1{3,}/.test(text)) { + console.log("Transcription looks like gibberish; discarding."); + cleanupListeners(); + return resolve(null); + } + + // 3. Check transcription length, with allowed greetings + const letterCount = text.replace(/[^A-Za-z]/g, "").length; + const normalizedText = text.trim().toLowerCase(); + const allowedGreetings = new Set(["hi", "hello", "greetings", "hey"]); + + if (letterCount < 8 && !allowedGreetings.has(normalizedText)) { + console.log("Transcription too short and not an allowed greeting; discarding."); + cleanupListeners(); + return resolve(null); + } + + console.log("Transcription:", text); + + // Format message so it looks like: "[SERVER] message" + const finalMessage = `[${STT_USERNAME}] ${text}`; + + // If STT_AGENT_NAME is empty, broadcast to all agents + if (!STT_AGENT_NAME.trim()) { + const agentNames = getAllInGameAgentNames(); // from mind_server + for (const agentName of agentNames) { + getIO().emit('send-message', agentName, finalMessage); + } + } else { + // Otherwise, send only to the specified agent + getIO().emit('send-message', STT_AGENT_NAME, finalMessage); + } + + cleanupListeners(); + 
resolve(text); + } catch (err) { + cleanupListeners(); + reject(err); + } + }); + + ai.start(); + + function cleanupListeners() { + ai.removeAllListeners('data'); + ai.removeAllListeners('error'); + fileWriter.removeAllListeners('finish'); + if (silenceTimer) clearTimeout(silenceTimer); + + // release lock + isRecording = false; + } + }); +} + +/** + * Runs recording sessions sequentially, so only one at a time + */ +async function continuousLoop() { + while (true) { + try { + await recordAndTranscribeOnce(); + } catch (err) { + console.error("[STT Error]", err); + } + // short gap + await new Promise(res => setTimeout(res, 1000)); + } +} + +export function initTTS() { + // Only run if stt_transcription is true and we haven't started already + if (!settings.stt_transcription) return; + + if (sttRunning) { + console.log("STT loop already running; skipping re-init."); + return; + } + sttRunning = true; + + continuousLoop().catch((err) => { + console.error("[STT] continuousLoop crashed", err); + }); +} + +initTTS(); diff --git a/src/server/mind_server.js b/src/server/mind_server.js index eed71d7..7bf530a 100644 --- a/src/server/mind_server.js +++ b/src/server/mind_server.js @@ -161,3 +161,6 @@ function stopAllAgents() { export const getIO = () => io; export const getServer = () => server; export const getConnectedAgents = () => connectedAgents; +export function getAllInGameAgentNames() { + return Object.keys(inGameAgents); + }