Added pollinations TTS support.

2025-08-04 06:15:32 +02:00 · 2025-04-20 18:20:44 +01:00 · 2025-04-20 18:20:44 +01:00 · d263a5fdde
commit d263a5fdde
parent 2b3eb716e0
4 changed files with 114 additions and 30 deletions
--- a/settings.js
+++ b/settings.js
@ -28,8 +28,11 @@ const settings = {
    "load_memory": false, // load memory from previous session
    "init_message": "Respond with hello world and your name", // sends to all on spawn
    "only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly
-    "speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
-    "language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages
+    
+    "speak": true,
+    "speak_model": "pollinations/openai-audio/echo", // allows all bots to speak through text-to-speech. format: {provider}/{model}/{voice}. if set to "system" it will use system text-to-speech, which works on windows and mac, but on linux you need to `apt install espeak`.
+    
+    "language": "en", // translate to/from this language. NOT text-to-speech language. Supports these language names: https://cloud.google.com/translate/docs/languages
    "show_bot_views": false, // show bot's view in browser at localhost:3000, 3001...

    "allow_insecure_coding": false, // allows newAction command and model can write/run code on your computer. enable at own risk
--- a/src/agent/agent.js
+++ b/src/agent/agent.js
@ -15,7 +15,7 @@ import { addBrowserViewer } from './vision/browser_viewer.js';
 import settings from '../../settings.js';
 import { serverProxy } from './agent_proxy.js';
 import { Task } from './tasks.js';
-import { say } from './speak.js';
+import { Speaker } from './speak.js';

 export class Agent {
    async start(profile_fp, load_mem=false, init_message=null, count_id=0, task_path=null, task_id=null) {
--- a/src/agent/speak.js
+++ b/src/agent/speak.js
@ -1,43 +1,75 @@
-import { exec } from 'child_process';
+import { exec, spawn } from 'child_process';
+import settings from '../../settings.js';
+import { Pollinations } from '../models/pollinations.js';

 let speakingQueue = [];
 let isSpeaking = false;

-export function say(textToSpeak) {
-  speakingQueue.push(textToSpeak);
-  if (!isSpeaking) {
-    processQueue();
-  }
+export function say(text) {
+  speakingQueue.push(text);
+  if (!isSpeaking) processQueue();
 }

-function processQueue() {
+async function processQueue() {
  if (speakingQueue.length === 0) {
    isSpeaking = false;
    return;
  }
-
  isSpeaking = true;
-  const textToSpeak = speakingQueue.shift();
-  const isWin = process.platform === "win32";
-  const isMac = process.platform === "darwin";
+  const txt = speakingQueue.shift();

-  let command;
+  const isWin = process.platform === 'win32';
+  const isMac = process.platform === 'darwin';
+  const model = settings.speak_model || 'system';
+
+  if (model === 'system') {
+    // system TTS
+    const cmd = isWin
+      ? `powershell -NoProfile -Command "Add-Type -AssemblyName System.Speech; \
+$s=New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate=2; \
+$s.Speak('${txt.replace(/'/g,"''")}'); $s.Dispose()"`
+      : isMac
+      ? `say "${txt.replace(/"/g,'\\"')}"`
+      : `espeak "${txt.replace(/"/g,'\\"')}"`;
+
+    exec(cmd, err => {
+      if (err) console.error('TTS error', err);
+      processQueue();
+    });
+
+  } else {
+    // remote audio provider
+    const [prov, mdl, voice] = model.split('/');
+    if (prov !== 'pollinations') throw new Error(`Unknown provider: ${prov}`);
+
+    try {
+      const audioData = await new Pollinations(mdl).sendAudioRequest(txt, voice);

      if (isWin) {
-    command = `powershell -Command "Add-Type -AssemblyName System.Speech; $s = New-Object System.Speech.Synthesis.SpeechSynthesizer; $s.Rate = 2; $s.Speak(\\"${textToSpeak}\\"); $s.Dispose()"`;
-  } else if (isMac) {
-    command = `say "${textToSpeak}"`;
+        const ps = `
+          Add-Type -AssemblyName presentationCore;
+          $p=New-Object System.Windows.Media.MediaPlayer;
+          $p.Open([Uri]::new("data:audio/mp3;base64,${audioData}"));
+          $p.Play();
+          Start-Sleep -Seconds [math]::Ceiling($p.NaturalDuration.TimeSpan.TotalSeconds);
+        `;
+        spawn('powershell', ['-NoProfile','-Command', ps], {
+          stdio: 'ignore', detached: true
+        }).unref();
+        processQueue();
+
      } else {
-    command = `espeak "${textToSpeak}"`;
+        const player = spawn('ffplay', ['-nodisp','-autoexit','pipe:0'], {
+          stdio: ['pipe','ignore','ignore']
+        });
+        player.stdin.write(Buffer.from(audioData, 'base64'));
+        player.stdin.end();
+        player.on('exit', processQueue);
      }

-  exec(command, (error, stdout, stderr) => {
-    if (error) {
-      console.error(`Error: ${error.message}`);
-      console.error(`${error.stack}`);
-    } else if (stderr) {
-      console.error(`Error: ${stderr}`);
+    } catch (e) {
+      console.error('Audio error', e);
+      processQueue();
+    }
  }
-    processQueue(); // Continue with the next message in the queue
-  });
 }
--- a/src/models/pollinations.js
+++ b/src/models/pollinations.js
@ -31,7 +31,7 @@ export class Pollinations {
                body: JSON.stringify(payload)
            });
            if (!response.ok) {
-                console.error(`Failed to receive response. Status`, response.status, response.text);
+                console.error(`Failed to receive response. Status`, response.status, (await response.text()));
                res = "My brain disconnected, try again.";
            } else {
                const result = await response.json();
@ -61,5 +61,54 @@ export class Pollinations {

        return this.sendRequest(imageMessages, systemMessage)
    }
+
+    async sendAudioRequest(text, voice) {
+        const fallback = "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU5LjI3LjEwMAAAAAAAAAAAAAAA/+NAwAAAAAAAAAAAAEluZm8AAAAPAAAAAAAAANAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAExhdmM1OS4zNwAAAAAAAAAAAAAAAAAAAAAAAAAAAADQAAAeowAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==";
+        // ^ 0 second silent audio clip
+    
+        const payload = {
+            model: this.model_name,
+            modalities: ["text", "audio"],
+            audio: {
+                voice: voice,
+                format: "mp3",
+            },
+            messages: [
+                {
+                    role: "developer",
+                    content: "You are an AI that echoes. Your sole function is to repeat back everything the user says to you exactly as it is written. This includes punctuation, grammar, language, and text formatting. Do not add, remove, or alter anything in the user's input in any way. Respond only with an exact duplicate of the user’s query."
+                    // this is required because pollinations attempts to send an AI response to the text instead of just saying the text.
+                },
+                {
+                    role: "user",
+                    content: text
+                }
+            ]
+        }
+
+        let audioData = null;
+
+        try {
+            const response = await fetch(this.url, {
+                method: "POST",
+                headers: {
+                    "Content-Type": "application/json"
+                },
+                body: JSON.stringify(payload)
+            })
+
+            if (!response.ok) {
+                console.error("Failed to get text transcription. Status", response.status, (await response.text()))
+                return fallback
+            }
+
+            const result = await response.json();
+            audioData = result.choices[0].message.audio.data;
+            return audioData;
+        } catch (err) {
+            console.error("TTS fetch failed:", err);
+            return fallback
+        }
+    }
 }