diff --git a/src/agent/agent.js b/src/agent/agent.js
index d69433d..6f8851e 100644
--- a/src/agent/agent.js
+++ b/src/agent/agent.js
@@ -15,7 +15,7 @@ import { addBrowserViewer } from './vision/browser_viewer.js';
 import { serverProxy, sendOutputToServer } from './mindserver_proxy.js';
 import settings from './settings.js';
 import { Task } from './tasks/tasks.js';
-import { say } from './speak.js';
+import { speak } from './speak.js';
 
 export class Agent {
     async start(load_mem=false, init_message=null, count_id=0) {
@@ -384,7 +384,7 @@ export class Agent {
         }
         else {
             if (settings.speak) {
-                say(to_translate, this.prompter.profile.speak_model);
+                speak(to_translate, this.prompter.profile.speak_model);
             }
             if (settings.chat_ingame) {this.bot.chat(message);}
             sendOutputToServer(this.name, message);
diff --git a/src/agent/speak.js b/src/agent/speak.js
index a68735b..003655e 100644
--- a/src/agent/speak.js
+++ b/src/agent/speak.js
@@ -1,107 +1,150 @@
 import { exec, spawn } from 'child_process';
+import { promises as fs } from 'fs';
+import os from 'os';
+import path from 'path';
 import { TTSConfig as gptTTSConfig } from '../models/gpt.js';
 import { TTSConfig as geminiTTSConfig } from '../models/gemini.js';
 
-let speakingQueue = [];
+let speakingQueue = []; // each item: {text, model, audioData, ready}
 let isSpeaking = false;
 
-export function say(text, speak_model) {
-  speakingQueue.push([text, speak_model]);
-  if (!isSpeaking) processQueue();
+/** Queue `text` for playback via `speak_model` (string id or {api, model, voice, url}; default 'system'). */
+export function speak(text, speak_model) {
+    // Skip blank/non-string text up front so no remote TTS request is wasted on it.
+    if (typeof text !== 'string' || text.trim() === '') return;
+    const model = speak_model || 'system';
+    const item = { text, model, audioData: null, error: null, ready: null };
+    if (model === 'system') {
+        item.ready = Promise.resolve(); // system TTS needs no prefetch
+    } else {
+        // Prefetch now (overlaps with earlier playback); failures are parked on item.error.
+        item.ready = fetchRemoteAudio(text, model)
+            .then(data => { item.audioData = data; })
+            .catch(err => { item.error = err; });
+    }
+    speakingQueue.push(item);
+    if (!isSpeaking) processQueue();
+}
+
+/**
+ * Request synthesized speech for `txt` from a remote TTS provider.
+ * `model` is either a "provider/model/voice" string or an object carrying
+ * `api`, `model`, `voice` and an optional `url` that overrides the base URL.
+ * Returns the provider's sendAudioRequest() result; throws (as a rejection)
+ * when the provider is neither 'openai' nor 'google'.
+ */
+async function fetchRemoteAudio(txt, model) {
+    const backends = new Map([
+        ['openai', gptTTSConfig],
+        ['google', geminiTTSConfig],
+    ]);
+    // Unknown providers still get the OpenAI URL here, but are rejected below.
+    const baseUrlFor = (provider) =>
+        backends.has(provider) ? backends.get(provider).baseUrl : 'https://api.openai.com/v1';
+
+    const spec = typeof model === 'string'
+        ? (([api, name, voice]) => ({ api, model: name, voice }))(model.split('/'))
+        : model;
+    const endpoint = spec.url || baseUrlFor(spec.api);
+
+    const backend = backends.get(spec.api);
+    if (!backend) {
+        throw new Error(`TTS Provider ${spec.api} is not supported.`);
+    }
+    return backend.sendAudioRequest(txt, spec.model, spec.voice, endpoint);
 }
 
 async function processQueue() {
-  if (speakingQueue.length === 0) {
-    isSpeaking = false;
-    return;
-  }
-  isSpeaking = true;
-  const [txt, speak_model] = speakingQueue.shift();
-
-  const isWin = process.platform === 'win32';
-  const isMac = process.platform === 'darwin';
-  const model = speak_model || 'openai/tts-1/echo';
-
-  if (model === 'system') {
-    // system TTS
-    const cmd = isWin
-      ? `powershell -NoProfile -Command "Add-Type -AssemblyName System.Speech; \
-$s=New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate=2; \
-$s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
-      : isMac
-      ? `say "${txt.replace(/"/g,'\\"')}"`
-      : `espeak "${txt.replace(/"/g,'\\"')}"`;
-
-    exec(cmd, err => {
-      if (err) console.error('TTS error', err);
-      processQueue();
-    });
-
-  } else {
-
-    function getModelUrl(prov) {
-      if (prov === 'openai') {
-        return gptTTSConfig.baseUrl;
-      } else if (prov === 'google') {
-        return geminiTTSConfig.baseUrl;
-      } else {
-        // fallback
-        return 'https://api.openai.com/v1'
-      }
+    isSpeaking = true;
+    if (speakingQueue.length === 0) {
+        isSpeaking = false;
+        return;
     }
-
-    // remote audio provider
-    let prov, mdl, voice, url;
-    if (typeof model === "string") {
-      [prov, mdl, voice] = model.split('/');
-      url = getModelUrl(prov);
-    } else {
-      prov = model.api;
-      mdl = model.model;
-      voice = model.voice;
-      url = model.url || getModelUrl(prov);
-    }
-
-    try {
-      let audioData;
-      if (prov === "openai") {
-        audioData = await gptTTSConfig.sendAudioRequest(txt, mdl, voice, url);
-      } else if (prov === "google") {
-        audioData = await geminiTTSConfig.sendAudioRequest(txt, mdl, voice, url);
-      } else {
-        throw new Error(`TTS Provider ${prov} is not supported.`);
-      }
-      
-      if (!audioData) {
-        throw new Error("TTS model did not return audio data");
-        // will be handled below
-      }
-
-      if (isWin) {
-        const ps = `
-          Add-Type -AssemblyName presentationCore;
-          $p=New-Object System.Windows.Media.MediaPlayer;
-          $p.Open([Uri]::new("data:audio/mp3;base64,${audioData}"));
-          $p.Play();
-          Start-Sleep -Seconds [math]::Ceiling($p.NaturalDuration.TimeSpan.TotalSeconds);
-        `;
-        spawn('powershell', ['-NoProfile','-Command', ps], {
-          stdio: 'ignore', detached: true
-        }).unref();
+    const item = speakingQueue.shift();
+    const { text: txt, model, audioData } = item;
+    if (txt.trim() === '') {
+        isSpeaking = false;
         processQueue();
-
-      } else {
-        const player = spawn('ffplay', ['-nodisp','-autoexit','pipe:0'], {
-          stdio: ['pipe','ignore','ignore']
-        });
-        player.stdin.write(Buffer.from(audioData, 'base64'));
-        player.stdin.end();
-        player.on('exit', processQueue);
-      }
-
-    } catch (e) {
-      console.error('[TTS] Audio error', e);
-      processQueue();
+        return;
+    }
+
+    const isWin = process.platform === 'win32';
+    const isMac = process.platform === 'darwin';
+
+    // wait for preprocessing if needed
+    try {
+        await item.ready;
+        if (item.error) throw item.error;
+    } catch (err) {
+        console.error('[TTS] preprocess error', err);
+        isSpeaking = false;
+        processQueue();
+        return;
+    }
+
+    if (model === 'system') {
+        // system TTS
+        const cmd = isWin
+            ? `powershell -NoProfile -Command "Add-Type -AssemblyName System.Speech; \
+            $s=New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate=2; \
+            $s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
+            : isMac
+            ? `say "${txt.replace(/"/g,'\\"')}"`
+            : `espeak "${txt.replace(/"/g,'\\"')}"`;
+
+        exec(cmd, err => {
+            if (err) console.error('TTS error', err);
+            isSpeaking = false;
+            processQueue();
+        });
+
+    } 
+    else {
+        // audioData was already fetched in speak()
+        const audioData = item.audioData;
+
+        if (!audioData) {
+            console.error('[TTS] No audio data ready');
+            isSpeaking = false;
+            processQueue();
+            return;
+        }
+
+        try {
+            if (isWin) {
+                const tmpPath = path.join(os.tmpdir(), `tts_${Date.now()}.mp3`);
+                await fs.writeFile(tmpPath, Buffer.from(audioData, 'base64'));
+
+                const player = spawn('ffplay', ['-nodisp', '-autoexit', '-loglevel', 'quiet', tmpPath], {
+                    stdio: 'ignore', windowsHide: true
+                });
+                player.on('error', async (err) => {
+                    console.error('[TTS] ffplay error', err);
+                    try { await fs.unlink(tmpPath); } catch {}
+                    isSpeaking = false;
+                    processQueue();
+                });
+                player.on('exit', async () => {
+                    try { await fs.unlink(tmpPath); } catch {}
+                    isSpeaking = false;
+                    processQueue();
+                });
+
+            } else {
+                const player = spawn('ffplay', ['-nodisp','-autoexit','pipe:0'], {
+                    stdio: ['pipe','ignore','ignore']
+                });
+                player.stdin.write(Buffer.from(audioData, 'base64'));
+                player.stdin.end();
+                player.on('exit', () => {
+                    isSpeaking = false;
+                    processQueue();
+                });
+            }
+        } catch (e) {
+            console.error('[TTS] Audio error', e);
+            isSpeaking = false;
+            processQueue();
+        }
     }
-  }
 }
diff --git a/src/models/gpt.js b/src/models/gpt.js
index 142a766..ddf2d95 100644
--- a/src/models/gpt.js
+++ b/src/models/gpt.js
@@ -98,8 +98,6 @@ const sendAudioRequest = async (text, model, voice, url) => {
         input: text
     }
 
-    let audioData = null;
-
     let config = {};
 
     if (url)