Merge pull request #7 from Sweaterdog/Speech-to-Text

Speech to text
Sweaterdog 2025-06-07 14:59:42 -07:00 committed by GitHub
commit 4efb5c304f
11 changed files with 335 additions and 33 deletions

README.md

@@ -120,6 +120,21 @@ When running in docker, if you want the bot to join your local minecraft server,
To connect to an unsupported minecraft version, you can try to use [viaproxy](services/viaproxy/README.md)
## STT in Mindcraft
STT allows you to speak to the model if you have a microphone.
STT can be enabled in `settings.js` under the section that looks like this:
```javascript
"stt_transcription": true, // Change this to "true" to enable STT
"stt_username": "SYSTEM",
"stt_agent_name": ""
```
The speech-to-text engine will begin listening on the system default input device.
When using STT, you **need** a [GroqCloud API key](https://console.groq.com/keys), as Groq is used for audio transcription.
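For reference, the key goes in `keys.json` at the project root (copied from `keys.example.json`, whose diff appears below); a minimal sketch with a placeholder value:
```json
{
    "GROQCLOUD_API_KEY": "your-groqcloud-api-key"
}
```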
# Bot Profiles
Bot profiles are json files (such as `andy.json`) that define:

keys.example.json

@@ -1,17 +1,17 @@
{
"OPENAI_API_KEY": "",
"OPENAI_ORG_ID": "",
"GEMINI_API_KEY": "",
"ANTHROPIC_API_KEY": "",
"REPLICATE_API_KEY": "",
"GROQCLOUD_API_KEY": "",
"HUGGINGFACE_API_KEY": "",
"QWEN_API_KEY": "",
"XAI_API_KEY": "",
"MISTRAL_API_KEY": "",
"DEEPSEEK_API_KEY": "",
"GHLF_API_KEY": "",
"HYPERBOLIC_API_KEY": "",
"NOVITA_API_KEY": "",
"OPENROUTER_API_KEY": ""
}

main.js

@@ -5,6 +5,7 @@ import { hideBin } from 'yargs/helpers';
import { createMindServer } from './src/server/mind_server.js';
import { mainProxy } from './src/process/main_proxy.js';
import { readFileSync } from 'fs';
import { initTTS } from './src/process/tts_process.js';
function parseArguments() {
return yargs(hideBin(process.argv))
@@ -39,7 +40,7 @@ async function main() {
const profiles = getProfiles(args);
console.log(profiles);
const { load_memory, init_message } = settings;
for (let i=0; i<profiles.length; i++) {
const agent_process = new AgentProcess();
const profile = readFileSync(profiles[i], 'utf8');
@@ -48,6 +49,7 @@ async function main() {
agent_process.start(profiles[i], load_memory, init_message, i, args.task_path, args.task_id);
await new Promise(resolve => setTimeout(resolve, 1000));
}
initTTS();
}
try {

package.json

@@ -9,7 +9,8 @@
"cheerio": "^1.0.0",
"express": "^4.18.2",
"google-translate-api-x": "^10.7.1",
"groq-sdk": "^0.15.0",
"groq-sdk": "^0.5.0",
"mic": "^2.1.2",
"minecraft-data": "^3.78.0",
"mineflayer": "^4.26.0",
"mineflayer-armor-manager": "^2.0.1",
@@ -17,6 +18,7 @@
"mineflayer-collectblock": "^1.4.1",
"mineflayer-pathfinder": "^2.4.5",
"mineflayer-pvp": "^1.3.2",
"naudiodon": "^2.3.6",
"node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
"openai": "^4.4.0",
"patch-package": "^8.0.0",
@@ -28,6 +30,7 @@
"socket.io-client": "^4.7.2",
"three": "^0.128.0",
"vec3": "^0.1.10",
"wav": "^1.0.2",
"yargs": "^17.7.2"
},
"scripts": {
@@ -40,4 +43,4 @@
"eslint-plugin-no-floating-promise": "^2.0.0",
"globals": "^15.11.0"
}
}


@@ -1,13 +1,12 @@
diff --git a/node_modules/@google/generative-ai/dist/index.mjs b/node_modules/@google/generative-ai/dist/index.mjs
index 23a175b..aab7e19 100644
--- a/node_modules/@google/generative-ai/dist/index.mjs
+++ b/node_modules/@google/generative-ai/dist/index.mjs
@@ -151,7 +151,7 @@ class GoogleGenerativeAIResponseError extends GoogleGenerativeAIError {
* limitations under the License.
*/
const BASE_URL = "https://generativelanguage.googleapis.com";
@@ -156,1 +156,1 @@
-const API_VERSION = "v1";
+const API_VERSION = "v1beta";
diff --git a/node_modules/@google/generative-ai/dist/index.js b/node_modules/@google/generative-ai/dist/index.js
--- a/node_modules/@google/generative-ai/dist/index.js
+++ b/node_modules/@google/generative-ai/dist/index.js
@@ -156,1 +156,1 @@
-const API_VERSION = "v1";
+const API_VERSION = "v1beta";
/**
* We can't `require` package.json if this runs on web. We will use rollup to
* swap in the version number here at build time.


@@ -7,4 +7,4 @@
"embedding": "openai"
}
}

settings.js

@@ -29,7 +29,6 @@ const settings = {
"load_memory": false, // load memory from previous session
"init_message": "Respond with hello world and your name", // sends to all on spawn
"only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly
"speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
"language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages
"show_bot_views": false, // show bot's view in browser at localhost:3000, 3001...
@@ -46,10 +45,15 @@ const settings = {
"verbose_commands": true, // show full command syntax
"narrate_behavior": true, // chat simple automatic actions ('Picking up item!')
"chat_bot_messages": true, // publicly chat messages to other bots
"stt_transcription": false, // change this to "true" or "false" depending on if you want STT in Mindcraft, STT needs a GroqCloud API key, can be found here: https://console.groq.com/keys
"stt_username": "SYSTEM", // Change this to the username the model will respond to.
"stt_agent_name": "" // Change the name here to whatever your agent is named, if left empty, will send message to all agents.
"speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
"log_normal_data": false,
"log_reasoning_data": false,
"log_vision_data": false,
"log_normal_data": false, // Logs all inputs / outputs without reasoning or vision data
"log_reasoning_data": false, // Logs only reasoning inputs / outputs
"log_vision_data": false, // Logs only vision inputs / outputs
}
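For example, to enable STT and route transcriptions to a single bot (using `andy`, from the `andy.json` profile mentioned above, as a hypothetical target), the new settings might look like this:
```javascript
"stt_transcription": true, // enable STT; requires a GroqCloud API key
"stt_username": "SYSTEM",  // transcriptions are sent under this username
"stt_agent_name": "andy"   // deliver only to the agent named "andy"
```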

src/agent/agent.js

@ -20,6 +20,15 @@ import { say } from './speak.js';
export class Agent {
async start(profile_fp, load_mem=false, init_message=null, count_id=0, task_path=null, task_id=null) {
this.last_sender = null;
// Safely attach agent instance to a global-like object so STT code can access it.
// This works in Node.js ESM or CommonJS. If "global" doesn't exist, fall back to "globalThis".
const globalObj = (typeof global !== 'undefined') ? global : globalThis;
try {
globalObj.agent = this;
} catch(e) {
console.warn("Failed attaching agent to global object:", e);
}
this.latestScreenshotPath = null;
this.count_id = count_id;
if (!profile_fp) {
@@ -126,6 +135,7 @@ export class Agent {
});
}
async _setupEventHandlers(save_data, init_message) {
const ignore_messages = [
"Set own game mode to",

src/models/groq.js

@@ -1,4 +1,5 @@
import Groq from 'groq-sdk';
import fs from "fs";
import { getKey } from '../utils/keys.js';
import { log, logVision } from '../../logger.js';
@@ -104,3 +105,21 @@ export class GroqCloudAPI {
throw new Error('Embeddings are not supported by Groq.');
}
}
export class GroqCloudTTS {
constructor() {
this.groq = new Groq({ apiKey: getKey('GROQCLOUD_API_KEY') });
}
async transcribe(filePath, options = {}) {
const transcription = await this.groq.audio.transcriptions.create({
file: fs.createReadStream(filePath),
model: options.model || "distil-whisper-large-v3-en", // or "whisper-large-v3-turbo"
prompt: options.prompt || "",
response_format: options.response_format || "json",
language: options.language || "en",
temperature: options.temperature !== undefined ? options.temperature : 0.0,
});
return transcription.text;
}
}
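A minimal usage sketch of the new class (the WAV path is hypothetical; assumes a valid `GROQCLOUD_API_KEY` in `keys.json` and an ESM context for top-level await):
```javascript
import { GroqCloudTTS } from './src/models/groq.js';

// Transcribe a prerecorded mono WAV file; defaults to the distil-whisper-large-v3-en model.
const stt = new GroqCloudTTS();
const text = await stt.transcribe('./recording.wav');
console.log(text);
```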

src/process/tts_process.js (new file)

@@ -0,0 +1,247 @@
import settings from '../../settings.js';
import { GroqCloudTTS } from '../models/groq.js';
import portAudio from 'naudiodon';
const { AudioIO, SampleFormat16Bit } = portAudio;
import wav from 'wav';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
// Import getIO and our new function getAllInGameAgentNames
import { getIO, getAllInGameAgentNames } from '../server/mind_server.js';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
* Delete leftover speech_*.wav from previous runs
*/
const leftover = fs.readdirSync(__dirname).filter(f => /^speech_\d+\.wav$/.test(f));
for (const file of leftover) {
try {
fs.unlinkSync(path.join(__dirname, file));
} catch (_) {
// ignore errors
}
}
// Configuration
const RMS_THRESHOLD = 500; // Lower threshold for faint audio
const SILENCE_DURATION = 2000; // 2 seconds of silence after speech => stop
const SAMPLE_RATE = 16000;
const BIT_DEPTH = 16;
const STT_USERNAME = settings.stt_username || "SERVER"; // Name that appears as sender
const STT_AGENT_NAME = settings.stt_agent_name || ""; // If blank, broadcast to all
// Guards to prevent multiple overlapping recordings
let isRecording = false; // Ensures only one recordAndTranscribeOnce at a time
let sttRunning = false; // Ensures continuousLoop is started only once
/**
* Records one session, transcribes, and sends to MindServer as a chat message
*/
async function recordAndTranscribeOnce() {
// If another recording is in progress, just skip
if (isRecording) {
console.log("Another recording is still in progress; skipping new record attempt.");
return null;
}
isRecording = true;
const outFile = path.join(__dirname, `speech_${Date.now()}.wav`);
const fileWriter = new wav.FileWriter(outFile, {
channels: 1,
sampleRate: SAMPLE_RATE,
bitDepth: BIT_DEPTH
});
const ai = new AudioIO({
inOptions: {
channelCount: 1,
sampleFormat: SampleFormat16Bit,
sampleRate: SAMPLE_RATE,
deviceId: -1,
closeOnError: true
}
});
let recording = true;
let hasHeardSpeech = false;
let silenceTimer = null;
let finished = false; // Guard to ensure final processing is done only once
// Helper to reset silence timer
function resetSilenceTimer() {
if (silenceTimer) clearTimeout(silenceTimer);
if (hasHeardSpeech) {
silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION);
}
}
// Stop recording
function stopRecording() {
if (!recording) return;
recording = false;
ai.quit();
fileWriter.end();
}
// We wrap everything in a promise so we can await the transcription
return new Promise((resolve, reject) => {
// Attach event handlers
ai.on('data', (chunk) => {
fileWriter.write(chunk);
// Calculate RMS for threshold detection
let sumSquares = 0;
const sampleCount = chunk.length / 2;
for (let i = 0; i < chunk.length; i += 2) {
const sample = chunk.readInt16LE(i);
sumSquares += sample * sample;
}
const rms = Math.sqrt(sumSquares / sampleCount);
// If RMS passes threshold, we've heard speech
if (rms > RMS_THRESHOLD) {
if (!hasHeardSpeech) {
hasHeardSpeech = true;
}
resetSilenceTimer();
}
});
ai.on('error', (err) => {
cleanupListeners();
reject(err);
});
fileWriter.on('finish', async () => {
if (finished) return;
finished = true;
try {
// Check audio duration
const stats = fs.statSync(outFile);
const headerSize = 44; // standard WAV header size
const dataSize = stats.size - headerSize;
const duration = dataSize / (SAMPLE_RATE * (BIT_DEPTH / 8));
if (duration < 2.75) {
console.log("Audio too short (<2.75s); discarding.");
fs.unlink(outFile, () => {});
cleanupListeners();
return resolve(null);
}
// Transcribe
const groqTTS = new GroqCloudTTS();
const text = await groqTTS.transcribe(outFile, {
model: "distil-whisper-large-v3-en",
prompt: "",
response_format: "json",
language: "en",
temperature: 0.0
});
fs.unlink(outFile, () => {}); // cleanup WAV file
// Basic check for empty or whitespace
if (!text || !text.trim()) {
console.log("Transcription empty; discarding.");
cleanupListeners();
return resolve(null);
}
// Heuristic checks to determine if the transcription is genuine
// 1. Ensure at least one alphabetical character
if (!/[A-Za-z]/.test(text)) {
console.log("Transcription has no letters; discarding.");
cleanupListeners();
return resolve(null);
}
// 2. Check for gibberish repeated sequences
if (/([A-Za-z])\1{3,}/.test(text)) {
console.log("Transcription looks like gibberish; discarding.");
cleanupListeners();
return resolve(null);
}
// 3. Check transcription length, with allowed greetings
const letterCount = text.replace(/[^A-Za-z]/g, "").length;
const normalizedText = text.trim().toLowerCase();
const allowedGreetings = new Set(["hi", "hello", "greetings", "hey"]);
if (letterCount < 8 && !allowedGreetings.has(normalizedText)) {
console.log("Transcription too short and not an allowed greeting; discarding.");
cleanupListeners();
return resolve(null);
}
console.log("Transcription:", text);
// Format message so it looks like: "[SERVER] message"
const finalMessage = `[${STT_USERNAME}] ${text}`;
// If STT_AGENT_NAME is empty, broadcast to all agents
if (!STT_AGENT_NAME.trim()) {
const agentNames = getAllInGameAgentNames(); // from mind_server
for (const agentName of agentNames) {
getIO().emit('send-message', agentName, finalMessage);
}
} else {
// Otherwise, send only to the specified agent
getIO().emit('send-message', STT_AGENT_NAME, finalMessage);
}
cleanupListeners();
resolve(text);
} catch (err) {
cleanupListeners();
reject(err);
}
});
ai.start();
function cleanupListeners() {
ai.removeAllListeners('data');
ai.removeAllListeners('error');
fileWriter.removeAllListeners('finish');
if (silenceTimer) clearTimeout(silenceTimer);
// release lock
isRecording = false;
}
});
}
/**
* Runs recording sessions sequentially, so only one at a time
*/
async function continuousLoop() {
while (true) {
try {
await recordAndTranscribeOnce();
} catch (err) {
console.error("[STT Error]", err);
}
// short gap
await new Promise(res => setTimeout(res, 1000));
}
}
export function initTTS() {
// Only run if stt_transcription is true and we haven't started already
if (!settings.stt_transcription) return;
if (sttRunning) {
console.log("STT loop already running; skipping re-init.");
return;
}
sttRunning = true;
continuousLoop().catch((err) => {
console.error("[STT] continuousLoop crashed", err);
});
}
initTTS();

src/server/mind_server.js

@@ -161,3 +161,6 @@ function stopAllAgents() {
export const getIO = () => io;
export const getServer = () => server;
export const getConnectedAgents = () => connectedAgents;
export function getAllInGameAgentNames() {
return Object.keys(inGameAgents);
}