Mirror of https://github.com/kolbytn/mindcraft.git (synced 2025-08-04 22:35:35 +02:00)
Merge da0722a8fb into f2f06fcf3f
This commit is contained in: commit a354e76dd4
34 changed files with 1753 additions and 579 deletions
37 README.md
@@ -53,7 +53,7 @@ You can configure the agent's name, model, and prompts in their profile like `an
| `anthropic` | `ANTHROPIC_API_KEY` | `claude-3-haiku-20240307` | [docs](https://docs.anthropic.com/claude/docs/models-overview) |
| `xai` | `XAI_API_KEY` | `grok-2-1212` | [docs](https://docs.x.ai/docs) |
| `deepseek` | `DEEPSEEK_API_KEY` | `deepseek-chat` | [docs](https://api-docs.deepseek.com/) |
| `ollama` (local) | n/a | `ollama/llama3.1` | [docs](https://ollama.com/library) |
| `ollama` (local) | n/a | `ollama/sweaterdog/andy-4` | [docs](https://ollama.com/library) |
| `qwen` | `QWEN_API_KEY` | `qwen-max` | [Intl.](https://www.alibabacloud.com/help/en/model-studio/developer-reference/use-qwen-by-calling-api)/[cn](https://help.aliyun.com/zh/model-studio/getting-started/models) |
| `mistral` | `MISTRAL_API_KEY` | `mistral-large-latest` | [docs](https://docs.mistral.ai/getting-started/models/models_overview/) |
| `replicate` | `REPLICATE_API_KEY` | `replicate/meta/meta-llama-3-70b-instruct` | [docs](https://replicate.com/collections/language-models) |
@@ -66,7 +66,25 @@ You can configure the agent's name, model, and prompts in their profile like `an
| `vllm` | n/a | `vllm/llama3` | n/a |
If you use Ollama, install the default models (generation and embedding) with the following terminal command:

`ollama pull llama3.1 && ollama pull nomic-embed-text`

`ollama pull sweaterdog/andy-4 && ollama pull nomic-embed-text`
<details>
<summary>Additional info about Andy-4...</summary>



Andy-4 is a community-made, open-source model by Sweaterdog for playing Minecraft.
Since Andy-4 is open-source, you can download the model and play with it offline and for free.

The Andy-4 collection of models has reasoning and non-reasoning modes; sometimes the model will reason automatically without being prompted.
If you want to specifically enable reasoning, use the `andy-4-reasoning.json` profile.
Some Andy-4 models may not be able to disable reasoning, no matter what profile is used.

Andy-4 comes in many different models and sizes.
For more information about which model size is best for you, check [Sweaterdog's Ollama page](https://ollama.com/Sweaterdog/Andy-4).

If you have any issues, join the Mindcraft server and ping `@Sweaterdog` with your issue, or open an issue on the [Andy-4 huggingface repo](https://huggingface.co/Sweaterdog/Andy-4/discussions/new).
</details>
### Online Servers
To connect to online servers your bot will need an official Microsoft/Minecraft account. You can use your own personal one, but you will need another account if you want to connect at the same time and play alongside it. To connect, change these lines in `settings.js`:
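The exact values depend on your server; a rough sketch is below, with field names assumed from the default `settings.js` and placeholder values:

```javascript
// Illustrative only -- keys assumed from the default settings.js, values are placeholders.
"host": "play.example-server.net", // server address your accounts can join
"port": 25565,                     // default Minecraft port
"auth": "microsoft",               // authenticate with a Microsoft account
```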
@@ -102,6 +120,21 @@ When running in docker, if you want the bot to join your local minecraft server,

To connect to an unsupported minecraft version, you can try to use [viaproxy](services/viaproxy/README.md).
## STT in Mindcraft

STT (speech-to-text) allows you to speak to the model if you have a microphone.

STT can be enabled in `settings.js` under the section that looks like this:
```javascript
"stt_transcription": true, // Change this to "true" to enable STT
"stt_username": "SYSTEM",
"stt_agent_name": ""
```

The speech-to-text engine will begin listening on the system default input device.

When using STT, you **need** a [GroqCloud API key](https://console.groq.com/keys), as Groq is used for audio transcription.
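Where the key lives depends on your setup; a minimal sketch, assuming the project's usual `keys.json` file and a `GROQCLOUD_API_KEY` entry (name assumed, check your own key file):

```json
{
  "GROQCLOUD_API_KEY": "gsk_your_key_here"
}
```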
# Bot Profiles

Bot profiles are json files (such as `andy.json`) that define:
401 logger.js Normal file
@@ -0,0 +1,401 @@
|
|||
// --- START OF FILE logger.js ---
|
||||
|
||||
import { writeFileSync, mkdirSync, existsSync, appendFileSync, readFileSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import settings from './settings.js'; // Import settings
|
||||
import path from 'path'; // Needed for path operations
|
||||
|
||||
// --- Configuration ---
|
||||
const LOGS_DIR = './logs';
|
||||
const VISION_DATASET_DIR = join(LOGS_DIR, 'vision_dataset'); // HuggingFace dataset format
|
||||
const VISION_IMAGES_DIR = join(VISION_DATASET_DIR, 'images'); // Images subdirectory
|
||||
|
||||
// --- Log File Paths ---
|
||||
const REASONING_LOG_FILE = join(LOGS_DIR, 'reasoning_logs.csv');
|
||||
const NORMAL_LOG_FILE = join(LOGS_DIR, 'normal_logs.csv');
|
||||
const VISION_METADATA_FILE = join(VISION_DATASET_DIR, 'metadata.jsonl'); // HF metadata format
|
||||
|
||||
// --- Log Headers ---
|
||||
const TEXT_LOG_HEADER = 'input,output\n';
|
||||
|
||||
// --- Log Counters ---
|
||||
let logCounts = {
|
||||
normal: 0,
|
||||
reasoning: 0,
|
||||
vision: 0,
|
||||
total: 0,
|
||||
skipped_disabled: 0,
|
||||
skipped_empty: 0,
|
||||
vision_images_saved: 0,
|
||||
};
|
||||
|
||||
// --- Helper Functions ---
|
||||
function ensureDirectoryExistence(dirPath) {
|
||||
if (!existsSync(dirPath)) {
|
||||
try {
|
||||
mkdirSync(dirPath, { recursive: true });
|
||||
console.log(`[Logger] Created directory: ${dirPath}`);
|
||||
} catch (error) {
|
||||
console.error(`[Logger] Error creating directory ${dirPath}:`, error);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
function countLogEntries(logFile) {
|
||||
if (!existsSync(logFile)) return 0;
|
||||
try {
|
||||
const data = readFileSync(logFile, 'utf8');
|
||||
const lines = data.split('\n').filter(line => line.trim());
|
||||
// Check if the first line looks like a header before subtracting
|
||||
const hasHeader = lines.length > 0 && lines[0].includes(',');
|
||||
return Math.max(0, hasHeader ? lines.length - 1 : lines.length);
|
||||
} catch (err) {
|
||||
console.error(`[Logger] Error reading log file ${logFile}:`, err);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
function ensureLogFile(logFile, header) {
|
||||
if (!ensureDirectoryExistence(path.dirname(logFile))) return false; // Ensure parent dir exists
|
||||
|
||||
if (!existsSync(logFile)) {
|
||||
try {
|
||||
writeFileSync(logFile, header);
|
||||
console.log(`[Logger] Created log file: ${logFile}`);
|
||||
} catch (error) {
|
||||
console.error(`[Logger] Error creating log file ${logFile}:`, error);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
const content = readFileSync(logFile, 'utf-8');
|
||||
const headerLine = header.split('\n')[0];
|
||||
// If file is empty or header doesn't match, overwrite/create header
|
||||
if (!content.trim() || !content.startsWith(headerLine)) {
|
||||
// Attempt to prepend header if file has content but wrong/no header
|
||||
if(content.trim() && !content.startsWith(headerLine)) {
|
||||
console.warn(`[Logger] Log file ${logFile} seems to be missing or has an incorrect header. Prepending correct header.`);
|
||||
writeFileSync(logFile, header + content);
|
||||
} else {
|
||||
// File is empty or correctly headed, just ensure header is there
|
||||
writeFileSync(logFile, header);
|
||||
}
|
||||
console.log(`[Logger] Ensured header in log file: ${logFile}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`[Logger] Error checking/writing header for log file ${logFile}:`, error);
|
||||
// Proceed cautiously, maybe log an error and continue?
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
function writeToLogFile(logFile, csvEntry) {
|
||||
try {
|
||||
appendFileSync(logFile, csvEntry);
|
||||
// console.log(`[Logger] Logged data to ${logFile}`); // Keep console less noisy
|
||||
} catch (error) {
|
||||
console.error(`[Logger] Error writing to CSV log file ${logFile}:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
// --- Auto-Detection for Log Type (Based on Response Content) ---
|
||||
function determineLogType(response) {
|
||||
// Reasoning check: needs <think>...</think> but ignore the specific 'undefined' placeholder
|
||||
const isReasoning = response.includes('<think>') && response.includes('</think>') && !response.includes('<think>\nundefined</think>');
|
||||
|
||||
if (isReasoning) {
|
||||
return 'reasoning';
|
||||
} else {
|
||||
return 'normal';
|
||||
}
|
||||
}
|
||||
|
||||
function sanitizeForCsv(value) {
|
||||
if (typeof value !== 'string') {
|
||||
value = String(value);
|
||||
}
|
||||
// Escape double quotes by doubling them and enclose the whole string in double quotes
|
||||
return `"${value.replace(/"/g, '""')}"`;
|
||||
}
|
||||
|
||||
// Helper function to clean reasoning markers from input
|
||||
function cleanReasoningMarkers(input) {
|
||||
if (typeof input !== 'string') {
|
||||
return input;
|
||||
}
|
||||
|
||||
// Remove /think and /no_think markers
|
||||
return input.replace(/\/think/g, '').replace(/\/no_think/g, '').trim();
|
||||
}
|
||||
|
||||
// --- Main Logging Function (for text-based input/output) ---
|
||||
export function log(input, response) {
|
||||
const trimmedInputStr = input ? (typeof input === 'string' ? input.trim() : JSON.stringify(input)) : "";
|
||||
const trimmedResponse = response ? String(response).trim() : ""; // Ensure response is a string
|
||||
|
||||
// Clean reasoning markers from input before logging
|
||||
const cleanedInput = cleanReasoningMarkers(trimmedInputStr);
|
||||
|
||||
// Basic filtering
|
||||
if (!cleanedInput && !trimmedResponse) {
|
||||
logCounts.skipped_empty++;
|
||||
return;
|
||||
}
|
||||
if (cleanedInput === trimmedResponse) {
|
||||
logCounts.skipped_empty++;
|
||||
return;
|
||||
}
|
||||
// Avoid logging common error messages that aren't useful training data
|
||||
const errorMessages = [
|
||||
"My brain disconnected, try again.",
|
||||
"My brain just kinda stopped working. Try again.",
|
||||
"I thought too hard, sorry, try again.",
|
||||
"*no response*",
|
||||
"No response received.",
|
||||
"No response data.",
|
||||
"Failed to send", // Broader match
|
||||
"Error:", // Broader match
|
||||
"Vision is only supported",
|
||||
"Context length exceeded",
|
||||
"Image input modality is not enabled",
|
||||
"An unexpected error occurred",
|
||||
// Add more generic errors/placeholders as needed
|
||||
];
|
||||
// Also check for responses that are just the input repeated (sometimes happens with errors)
|
||||
if (errorMessages.some(err => trimmedResponse.includes(err)) || trimmedResponse === cleanedInput) {
|
||||
logCounts.skipped_empty++;
|
||||
// console.warn(`[Logger] Skipping log due to error/placeholder/repeat: "${trimmedResponse.substring(0, 70)}..."`);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
const logType = determineLogType(trimmedResponse);
|
||||
let logFile;
|
||||
let header;
|
||||
let settingFlag;
|
||||
|
||||
switch (logType) {
|
||||
case 'reasoning':
|
||||
logFile = REASONING_LOG_FILE;
|
||||
header = TEXT_LOG_HEADER;
|
||||
settingFlag = settings.log_reasoning_data;
|
||||
break;
|
||||
case 'normal':
|
||||
default:
|
||||
logFile = NORMAL_LOG_FILE;
|
||||
header = TEXT_LOG_HEADER;
|
||||
settingFlag = settings.log_normal_data;
|
||||
break;
|
||||
}
|
||||
|
||||
// Check if logging for this type is enabled
|
||||
if (!settingFlag) {
|
||||
logCounts.skipped_disabled++;
|
||||
return;
|
||||
}
|
||||
|
||||
// Ensure directory and file exist
|
||||
if (!ensureLogFile(logFile, header)) return; // ensureLogFile now checks parent dir too
|
||||
|
||||
// Prepare the CSV entry using the sanitizer with cleaned input
|
||||
const safeInput = sanitizeForCsv(cleanedInput);
|
||||
const safeResponse = sanitizeForCsv(trimmedResponse);
|
||||
const csvEntry = `${safeInput},${safeResponse}\n`;
|
||||
|
||||
// Write to the determined log file
|
||||
writeToLogFile(logFile, csvEntry);
|
||||
|
||||
// Update counts
|
||||
logCounts[logType]++;
|
||||
logCounts.total++; // Total here refers to text logs primarily
|
||||
|
||||
// Display summary periodically (based on total text logs)
|
||||
if (logCounts.normal + logCounts.reasoning > 0 && (logCounts.normal + logCounts.reasoning) % 20 === 0) {
|
||||
printSummary();
|
||||
}
|
||||
}
|
||||
|
||||
// --- Enhanced Vision Logging Function for HuggingFace Dataset Format ---
|
||||
export function logVision(conversationHistory, imageBuffer, response, visionMessage = null) {
|
||||
if (!settings.log_vision_data) {
|
||||
logCounts.skipped_disabled++;
|
||||
return;
|
||||
}
|
||||
|
||||
const trimmedResponse = response ? String(response).trim() : "";
|
||||
|
||||
if (!conversationHistory || conversationHistory.length === 0 || !trimmedResponse || !imageBuffer) {
|
||||
logCounts.skipped_empty++;
|
||||
return;
|
||||
}
|
||||
|
||||
// Filter out error messages
|
||||
const errorMessages = [
|
||||
"My brain disconnected, try again.",
|
||||
"My brain just kinda stopped working. Try again.",
|
||||
"I thought too hard, sorry, try again.",
|
||||
"*no response*",
|
||||
"No response received.",
|
||||
"No response data.",
|
||||
"Failed to send",
|
||||
"Error:",
|
||||
"Vision is only supported",
|
||||
"Context length exceeded",
|
||||
"Image input modality is not enabled",
|
||||
"An unexpected error occurred",
|
||||
];
|
||||
|
||||
if (errorMessages.some(err => trimmedResponse.includes(err))) {
|
||||
logCounts.skipped_empty++;
|
||||
return;
|
||||
}
|
||||
|
||||
// Ensure directories exist
|
||||
if (!ensureDirectoryExistence(VISION_DATASET_DIR)) return;
|
||||
if (!ensureDirectoryExistence(VISION_IMAGES_DIR)) return;
|
||||
|
||||
try {
|
||||
// Generate unique filename for the image
|
||||
const timestamp = Date.now();
|
||||
const randomSuffix = Math.random().toString(36).substring(2, 8);
|
||||
const imageFilename = `vision_${timestamp}_${randomSuffix}.jpg`;
|
||||
const imagePath = join(VISION_IMAGES_DIR, imageFilename);
|
||||
const relativeImagePath = `images/${imageFilename}`; // Relative path for metadata
|
||||
|
||||
// Save the image
|
||||
writeFileSync(imagePath, imageBuffer);
|
||||
logCounts.vision_images_saved++;
|
||||
|
||||
// Extract the actual message sent with the image
|
||||
// This is typically the vision prompt/instruction
|
||||
let inputMessage = visionMessage;
|
||||
if (!inputMessage && conversationHistory.length > 0) {
|
||||
// Try to get the last user message or system message
|
||||
const lastMessage = conversationHistory[conversationHistory.length - 1];
|
||||
if (typeof lastMessage.content === 'string') {
|
||||
inputMessage = lastMessage.content;
|
||||
} else if (Array.isArray(lastMessage.content)) {
|
||||
// Find text content in the message
|
||||
const textContent = lastMessage.content.find(c => c.type === 'text');
|
||||
inputMessage = textContent ? textContent.text : '';
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to conversation history if no specific message
|
||||
if (!inputMessage) {
|
||||
inputMessage = formatConversationInput(conversationHistory);
|
||||
}
|
||||
|
||||
// Create metadata entry in JSONL format for HuggingFace
|
||||
const metadataEntry = {
|
||||
file_name: relativeImagePath,
|
||||
text: inputMessage,
|
||||
response: trimmedResponse,
|
||||
timestamp: timestamp
|
||||
};
|
||||
|
||||
// Append to metadata JSONL file
|
||||
const jsonlLine = JSON.stringify(metadataEntry) + '\n';
|
||||
appendFileSync(VISION_METADATA_FILE, jsonlLine);
|
||||
|
||||
logCounts.vision++;
|
||||
logCounts.total++;
|
||||
|
||||
// Display summary periodically
|
||||
if (logCounts.vision > 0 && logCounts.vision % 10 === 0) {
|
||||
printSummary();
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
console.error(`[Logger] Error logging vision data:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to format conversation history as fallback
|
||||
function formatConversationInput(conversationHistory) {
|
||||
if (!conversationHistory || conversationHistory.length === 0) return '';
|
||||
|
||||
const formattedHistory = [];
|
||||
|
||||
for (const turn of conversationHistory) {
|
||||
const formattedTurn = {
|
||||
role: turn.role || 'user',
|
||||
content: []
|
||||
};
|
||||
|
||||
// Handle different content formats
|
||||
if (typeof turn.content === 'string') {
|
||||
formattedTurn.content.push({
|
||||
type: 'text',
|
||||
text: turn.content
|
||||
});
|
||||
} else if (Array.isArray(turn.content)) {
|
||||
// Already in the correct format
|
||||
formattedTurn.content = turn.content;
|
||||
} else if (turn.content && typeof turn.content === 'object') {
|
||||
// Convert object to array format
|
||||
if (turn.content.text) {
|
||||
formattedTurn.content.push({
|
||||
type: 'text',
|
||||
text: turn.content.text
|
||||
});
|
||||
}
|
||||
if (turn.content.image) {
|
||||
formattedTurn.content.push({
|
||||
type: 'image',
|
||||
image: turn.content.image
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
formattedHistory.push(formattedTurn);
|
||||
}
|
||||
|
||||
return JSON.stringify(formattedHistory);
|
||||
}
|
||||
|
||||
function printSummary() {
|
||||
const totalStored = logCounts.normal + logCounts.reasoning + logCounts.vision;
|
||||
console.log('\n' + '='.repeat(60));
|
||||
console.log('LOGGER SUMMARY');
|
||||
console.log('-'.repeat(60));
|
||||
console.log(`Normal logs stored: ${logCounts.normal}`);
|
||||
console.log(`Reasoning logs stored: ${logCounts.reasoning}`);
|
||||
console.log(`Vision logs stored: ${logCounts.vision} (Images saved: ${logCounts.vision_images_saved})`);
|
||||
console.log(`Skipped (disabled): ${logCounts.skipped_disabled}`);
|
||||
console.log(`Skipped (empty/err): ${logCounts.skipped_empty}`);
|
||||
console.log('-'.repeat(60));
|
||||
console.log(`Total logs stored: ${totalStored}`);
|
||||
console.log('='.repeat(60) + '\n');
|
||||
}
|
||||
|
||||
// Initialize counts at startup
|
||||
function initializeCounts() {
|
||||
logCounts.normal = countLogEntries(NORMAL_LOG_FILE);
|
||||
logCounts.reasoning = countLogEntries(REASONING_LOG_FILE);
|
||||
logCounts.vision = countVisionEntries(VISION_METADATA_FILE);
|
||||
// Total count will be accumulated during runtime
|
||||
console.log(`[Logger] Initialized log counts: Normal=${logCounts.normal}, Reasoning=${logCounts.reasoning}, Vision=${logCounts.vision}`);
|
||||
}
|
||||
|
||||
function countVisionEntries(metadataFile) {
|
||||
if (!existsSync(metadataFile)) return 0;
|
||||
try {
|
||||
const data = readFileSync(metadataFile, 'utf8');
|
||||
const lines = data.split('\n').filter(line => line.trim());
|
||||
return lines.length;
|
||||
} catch (err) {
|
||||
console.error(`[Logger] Error reading vision metadata file ${metadataFile}:`, err);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize counts at startup
|
||||
initializeCounts();
|
||||
|
||||
// --- END OF FILE logger.js ---
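The new file exposes two entry points: `log()` for plain text pairs and `logVision()` for image-grounded pairs. A minimal usage sketch follows (illustrative only; the real call sites are in the model provider diffs below, and the corresponding `log_*_data` flags in `settings.js` must be enabled):

```javascript
// Illustrative sketch only -- see the provider class diffs below for the actual call sites.
import { log, logVision } from './logger.js';
import { readFileSync } from 'fs';

const systemMessage = 'You are a playful Minecraft bot.';
const turns = [{ role: 'user', content: 'player1: come here' }];
const response = 'On my way! !goToPlayer("player1", 3)';

// Text pair: routed to normal_logs.csv or reasoning_logs.csv based on <think> tags.
log(JSON.stringify([{ role: 'system', content: systemMessage }, ...turns]), response);

// Vision pair: saves the image under logs/vision_dataset/images and appends a metadata.jsonl row.
const imageBuffer = readFileSync('./bots/andy/screenshots/example.jpg'); // hypothetical path
logVision(turns, imageBuffer, response, systemMessage);
```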
|
2 main.js
@@ -5,6 +5,7 @@ import { hideBin } from 'yargs/helpers';
import { createMindServer } from './src/server/mind_server.js';
import { mainProxy } from './src/process/main_proxy.js';
import { readFileSync } from 'fs';
import { initTTS } from './src/process/tts_process.js';

function parseArguments() {
return yargs(hideBin(process.argv))

@@ -48,6 +49,7 @@ async function main() {
agent_process.start(profiles[i], load_memory, init_message, i, args.task_path, args.task_id);
await new Promise(resolve => setTimeout(resolve, 1000));
}
initTTS();
}

try {
@@ -9,7 +9,8 @@
"cheerio": "^1.0.0",
"express": "^4.18.2",
"google-translate-api-x": "^10.7.1",
"groq-sdk": "^0.15.0",
"groq-sdk": "^0.5.0",
"mic": "^2.1.2",
"minecraft-data": "^3.78.0",
"mineflayer": "^4.26.0",
"mineflayer-armor-manager": "^2.0.1",
@@ -17,6 +18,7 @@
"mineflayer-collectblock": "^1.4.1",
"mineflayer-pathfinder": "^2.4.5",
"mineflayer-pvp": "^1.3.2",
"naudiodon": "^2.3.6",
"node-canvas-webgl": "PrismarineJS/node-canvas-webgl",
"openai": "^4.4.0",
"patch-package": "^8.0.0",
@@ -28,6 +30,7 @@
"socket.io-client": "^4.7.2",
"three": "^0.128.0",
"vec3": "^0.1.10",
"wav": "^1.0.2",
"yargs": "^17.7.2"
},
"scripts": {
@@ -1,13 +1,12 @@
diff --git a/node_modules/@google/generative-ai/dist/index.mjs b/node_modules/@google/generative-ai/dist/index.mjs
index 23a175b..aab7e19 100644
--- a/node_modules/@google/generative-ai/dist/index.mjs
+++ b/node_modules/@google/generative-ai/dist/index.mjs
@@ -151,7 +151,7 @@ class GoogleGenerativeAIResponseError extends GoogleGenerativeAIError {
 * limitations under the License.
 */
const BASE_URL = "https://generativelanguage.googleapis.com";
@@ -156,1 +156,1 @@
-const API_VERSION = "v1";
+const API_VERSION = "v1beta";
diff --git a/node_modules/@google/generative-ai/dist/index.js b/node_modules/@google/generative-ai/dist/index.js
--- a/node_modules/@google/generative-ai/dist/index.js
+++ b/node_modules/@google/generative-ai/dist/index.js
@@ -156,1 +156,1 @@
-const API_VERSION = "v1";
+const API_VERSION = "v1beta";
/**
 * We can't `require` package.json if this runs on web. We will use rollup to
 * swap in the version number here at build time.
14 profiles/andy-4-reasoning.json Normal file
@@ -0,0 +1,14 @@
|
|||
{
|
||||
"name": "Andy-4",
|
||||
|
||||
"model": "ollama/sweaterdog/andy-4",
|
||||
|
||||
"conversing": "You are a playful Minecraft bot named $NAME that can converse with players, see, move, mine, build, and interact with the world by using commands.\n$SELF_PROMPT Act human-like as if you were a typical Minecraft player, rather than an AI. Be very brief in your responses, don't apologize constantly, don't give instructions or make lists unless asked, and don't refuse requests. Think in high amounts before responding. Don't pretend to act, use commands immediately when requested. Do NOT say this: 'Sure, I've stopped.', instead say this: 'Sure, I'll stop. !stop'. Do NOT say this: 'On my way! Give me a moment.', instead say this: 'On my way! !goToPlayer(\"playername\", 3)'. Respond only as $NAME, never output '(FROM OTHER BOT)' or pretend to be someone else. If you have nothing to say or do, respond with an just a tab '\t'. This is extremely important to me, take a deep breath and have fun :)\nSummarized memory:'$MEMORY'\n$STATS\n$INVENTORY\n$COMMAND_DOCS\n$EXAMPLES\nReason before responding. Conversation Begin:",
|
||||
|
||||
"coding": "You are an intelligent mineflayer bot $NAME that plays minecraft by writing javascript codeblocks. Given the conversation, use the provided skills and world functions to write a js codeblock that controls the mineflayer bot ``` // using this syntax ```. The code will be executed and you will receive it's output. If an error occurs, write another codeblock and try to fix the problem. Be maximally efficient, creative, and correct. Be mindful of previous actions. Do not use commands !likeThis, only use codeblocks. The code is asynchronous and MUST USE AWAIT for all async function calls, and must contain at least one await. You have `Vec3`, `skills`, and `world` imported, and the mineflayer `bot` is given. Do not import other libraries. Think deeply before responding. Do not use setTimeout or setInterval. Do not speak conversationally, only use codeblocks. Do any planning in comments. This is extremely important to me, think step-by-step, take a deep breath and good luck! \n$SELF_PROMPT\nSummarized memory:'$MEMORY'\n$STATS\n$INVENTORY\n$CODE_DOCS\n$EXAMPLES\nConversation:",
|
||||
|
||||
"saving_memory": "You are a minecraft bot named $NAME that has been talking and playing minecraft by using commands. Update your memory by summarizing the following conversation and your old memory in your next response. Prioritize preserving important facts, things you've learned, useful tips, and long term reminders. Do Not record stats, inventory, or docs! Only save transient information from your chat history. You're limited to 500 characters, so be extremely brief, think about what you will summarize before responding, minimize words, and provide your summarization in Chinese. Compress useful information. \nOld Memory: '$MEMORY'\nRecent conversation: \n$TO_SUMMARIZE\nSummarize your old memory and recent conversation into a new memory, and respond only with the unwrapped memory text: ",
|
||||
|
||||
"bot_responder": "You are a minecraft bot named $NAME that is currently in conversation with another AI bot. Both of you can take actions with the !command syntax, and actions take time to complete. You are currently busy with the following action: '$ACTION' but have received a new message. Decide whether to 'respond' immediately or 'ignore' it and wait for your current action to finish. Be conservative and only respond when necessary, like when you need to change/stop your action, or convey necessary information. Example 1: You:Building a house! !newAction('Build a house.').\nOther Bot: 'Come here!'\nYour decision: ignore\nExample 2: You:Collecting dirt !collectBlocks('dirt',10).\nOther Bot: 'No, collect some wood instead.'\nYour decision: respond\nExample 3: You:Coming to you now. !goToPlayer('billy',3).\nOther Bot: 'What biome are you in?'\nYour decision: respond\nActual Conversation: $TO_SUMMARIZE\nDecide by outputting ONLY 'respond' or 'ignore', nothing else. Your decision:"
|
||||
|
||||
}
|
7 profiles/andy-4.json Normal file
@@ -0,0 +1,7 @@
{
"name": "andy-4",

"model": "ollama/sweaterdog/andy-4",

"embedding": "ollama"
}
17 settings.js
@@ -21,6 +21,7 @@ const settings = {
|
|||
// "./profiles/grok.json",
|
||||
// "./profiles/mistral.json",
|
||||
// "./profiles/deepseek.json",
|
||||
// "./profiles/andy-4.json",
|
||||
|
||||
// using more than 1 profile requires you to /msg each bot individually
|
||||
// individual profiles override values from the base profile
|
||||
|
@ -28,12 +29,12 @@ const settings = {
|
|||
"load_memory": false, // load memory from previous session
|
||||
"init_message": "Respond with hello world and your name", // sends to all on spawn
|
||||
"only_chat_with": [], // users that the bots listen to and send general messages to. if empty it will chat publicly
|
||||
"speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
|
||||
"language": "en", // translate to/from this language. Supports these language names: https://cloud.google.com/translate/docs/languages
|
||||
"show_bot_views": false, // show bot's view in browser at localhost:3000, 3001...
|
||||
|
||||
"allow_insecure_coding": false, // allows newAction command and model can write/run code on your computer. enable at own risk
|
||||
"allow_vision": false, // allows vision model to interpret screenshots as inputs
|
||||
"vision_mode": "prompted", // "off", "prompted", or "always"
|
||||
"blocked_actions" : ["!checkBlueprint", "!checkBlueprintLevel", "!getBlueprint", "!getBlueprintLevel"] , // commands to disable and remove from docs. Ex: ["!setMode"]
|
||||
"code_timeout_mins": -1, // minutes code is allowed to run. -1 for no timeout
|
||||
"relevant_docs_count": 5, // number of relevant code function docs to select for prompting. -1 for all
|
||||
|
@ -44,7 +45,16 @@ const settings = {
|
|||
"verbose_commands": true, // show full command syntax
|
||||
"narrate_behavior": true, // chat simple automatic actions ('Picking up item!')
|
||||
"chat_bot_messages": true, // publicly chat messages to other bots
|
||||
"log_all_prompts": false, // log ALL prompts to file
|
||||
|
||||
"stt_transcription": false, // change this to "true" or "false" depending on if you want STT in Mindcraft, STT needs a GroqCloud API key, can be found here: https://console.groq.com/keys
|
||||
"stt_username": "SYSTEM", // Change this to the username the model will respond to.
|
||||
"stt_agent_name": "" // Change the name here to whatever your agent is named, if left empty, will send message to all agents.
|
||||
"speak": false, // allows all bots to speak through system text-to-speech. works on windows, mac, on linux you need to `apt install espeak`
|
||||
|
||||
"log_normal_data": false, // Logs all inputs / outputs without reasoning or vision data
|
||||
"log_reasoning_data": false, // Logs only reasoning inputs / outputs
|
||||
"log_vision_data": false, // Logs only vision inputs / outputs
|
||||
|
||||
}
|
||||
|
||||
// these environment variables override certain settings
|
||||
|
@ -69,8 +79,5 @@ if (process.env.MAX_MESSAGES) {
|
|||
if (process.env.NUM_EXAMPLES) {
|
||||
settings.num_examples = process.env.NUM_EXAMPLES;
|
||||
}
|
||||
if (process.env.LOG_ALL) {
|
||||
settings.log_all_prompts = process.env.LOG_ALL;
|
||||
}
|
||||
|
||||
export default settings;
|
||||
|
|
|
@ -20,6 +20,16 @@ import { say } from './speak.js';
|
|||
export class Agent {
|
||||
async start(profile_fp, load_mem=false, init_message=null, count_id=0, task_path=null, task_id=null) {
|
||||
this.last_sender = null;
|
||||
// Safely attach agent instance to a global-like object so STT code can access it.
|
||||
// This works in Node.js ESM or CommonJS. If "global" doesn't exist, fallback to "globalThis".
|
||||
const globalObj = (typeof global !== 'undefined') ? global : globalThis;
|
||||
try {
|
||||
globalObj.agent = this;
|
||||
} catch(e) {
|
||||
console.warn("Failed attaching agent to global object:", e);
|
||||
}
|
||||
|
||||
this.latestScreenshotPath = null;
|
||||
this.count_id = count_id;
|
||||
if (!profile_fp) {
|
||||
throw new Error('No profile filepath provided');
|
||||
|
@ -116,7 +126,7 @@ export class Agent {
|
|||
this.checkAllPlayersPresent();
|
||||
|
||||
console.log('Initializing vision interpreter...');
|
||||
this.vision_interpreter = new VisionInterpreter(this, settings.allow_vision);
|
||||
this.vision_interpreter = new VisionInterpreter(this, settings.vision_mode);
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error in spawn event:', error);
|
||||
|
@ -125,6 +135,7 @@ export class Agent {
|
|||
});
|
||||
}
|
||||
|
||||
|
||||
async _setupEventHandlers(save_data, init_message) {
|
||||
const ignore_messages = [
|
||||
"Set own game mode to",
|
||||
|
@ -172,7 +183,8 @@ export class Agent {
|
|||
|
||||
if (save_data?.self_prompt) {
|
||||
if (init_message) {
|
||||
this.history.add('system', init_message);
|
||||
// Assuming init_message for self_prompt loading doesn't have an image
|
||||
await this.history.add('system', init_message, null);
|
||||
}
|
||||
await this.self_prompter.handleLoad(save_data.self_prompt, save_data.self_prompting_state);
|
||||
}
|
||||
|
@ -246,6 +258,15 @@ export class Agent {
|
|||
const from_other_bot = convoManager.isOtherAgent(source);
|
||||
|
||||
if (!self_prompt && !from_other_bot) { // from user, check for forced commands
|
||||
if (settings.vision_mode === 'always' && this.vision_interpreter && this.vision_interpreter.camera) {
|
||||
try {
|
||||
const screenshotFilename = await this.vision_interpreter.camera.capture();
|
||||
this.latestScreenshotPath = screenshotFilename;
|
||||
console.log(`[${this.name}] Captured screenshot in always_active mode: ${screenshotFilename}`);
|
||||
} catch (error) {
|
||||
console.error(`[${this.name}] Error capturing screenshot in always_active mode:`, error);
|
||||
}
|
||||
}
|
||||
const user_command_name = containsCommand(message);
|
||||
if (user_command_name) {
|
||||
if (!commandExists(user_command_name)) {
|
||||
|
@ -256,7 +277,16 @@ export class Agent {
|
|||
if (user_command_name === '!newAction') {
|
||||
// all user-initiated commands are ignored by the bot except for this one
|
||||
// add the preceding message to the history to give context for newAction
|
||||
this.history.add(source, message);
|
||||
// This is the user's message that contains the !newAction command.
|
||||
// If a screenshot was taken due to always, it should be associated here.
|
||||
let imagePathForNewActionCmd = null;
|
||||
if (settings.vision_mode === 'always' && this.latestScreenshotPath && !self_prompt && !from_other_bot) {
|
||||
imagePathForNewActionCmd = this.latestScreenshotPath;
|
||||
}
|
||||
await this.history.add(source, message, imagePathForNewActionCmd);
|
||||
if (imagePathForNewActionCmd) {
|
||||
this.latestScreenshotPath = null; // Consume path
|
||||
}
|
||||
}
|
||||
let execute_res = await executeCommand(this, message);
|
||||
if (execute_res)
|
||||
|
@ -281,11 +311,29 @@ export class Agent {
|
|||
behavior_log = '...' + behavior_log.substring(behavior_log.length - MAX_LOG);
|
||||
}
|
||||
behavior_log = 'Recent behaviors log: \n' + behavior_log;
|
||||
await this.history.add('system', behavior_log);
|
||||
await this.history.add('system', behavior_log, null); // Behavior log unlikely to have an image
|
||||
}
|
||||
|
||||
// Handle other user messages
|
||||
await this.history.add(source, message);
|
||||
// Handle other user messages (or initial system messages)
|
||||
let imagePathForInitialMessage = null;
|
||||
if (!self_prompt && !from_other_bot) {
|
||||
// If it's a user message and a screenshot was auto-captured for always
|
||||
if (settings.vision_mode === 'always' && this.latestScreenshotPath) {
|
||||
imagePathForInitialMessage = this.latestScreenshotPath;
|
||||
}
|
||||
} else if (source === 'system' && this.latestScreenshotPath && message.startsWith("You died at position")) {
|
||||
// Example: System death message might use a path if set by some (future) death-capture logic
|
||||
// For now, this is illustrative; death messages don't set latestScreenshotPath.
|
||||
// More relevant if a system message is a direct consequence of an action that *did* set the path.
|
||||
// However, explicit command result handling is better for those.
|
||||
// imagePathForInitialMessage = this.latestScreenshotPath; // Generally, system messages here won't have an image unless specific logic sets it.
|
||||
}
|
||||
|
||||
|
||||
await this.history.add(source, message, imagePathForInitialMessage);
|
||||
if (imagePathForInitialMessage) {
|
||||
this.latestScreenshotPath = null; // Consume the path if used
|
||||
}
|
||||
this.history.save();
|
||||
|
||||
if (!self_prompt && this.self_prompter.isActive()) // message is from user during self-prompting
|
||||
|
@ -306,10 +354,12 @@ export class Agent {
|
|||
|
||||
if (command_name) { // contains query or command
|
||||
res = truncCommandMessage(res); // everything after the command is ignored
|
||||
this.history.add(this.name, res);
|
||||
// Agent's own message stating the command it will execute
|
||||
await this.history.add(this.name, res, null);
|
||||
|
||||
if (!commandExists(command_name)) {
|
||||
this.history.add('system', `Command ${command_name} does not exist.`);
|
||||
// Agent hallucinated a command
|
||||
await this.history.add('system', `Command ${command_name} does not exist.`, null);
|
||||
console.warn('Agent hallucinated command:', command_name)
|
||||
continue;
|
||||
}
|
||||
|
@ -333,13 +383,24 @@ export class Agent {
|
|||
console.log('Agent executed:', command_name, 'and got:', execute_res);
|
||||
used_command = true;
|
||||
|
||||
if (execute_res)
|
||||
this.history.add('system', execute_res);
|
||||
else
|
||||
if (execute_res) {
|
||||
let imagePathForCommandResult = null;
|
||||
// Vision commands (!lookAtPlayer, !lookAtPosition) set latestScreenshotPath in VisionInterpreter.
|
||||
// This is relevant if mode is 'on' (analysis done, path stored by VI) or 'always_active' (screenshot taken, path stored by VI).
|
||||
if (command_name && (command_name === '!lookAtPlayer' || command_name === '!lookAtPosition') && this.latestScreenshotPath) {
|
||||
imagePathForCommandResult = this.latestScreenshotPath;
|
||||
}
|
||||
await this.history.add('system', execute_res, imagePathForCommandResult);
|
||||
if (imagePathForCommandResult) {
|
||||
this.latestScreenshotPath = null; // Consume the path
|
||||
}
|
||||
}
|
||||
else { // command execution didn't return anything or failed in a way that implies loop break
|
||||
break;
|
||||
}
|
||||
else { // conversation response
|
||||
this.history.add(this.name, res);
|
||||
}
|
||||
else { // conversation response (no command)
|
||||
await this.history.add(this.name, res, null); // Agent's text response, no image typically
|
||||
this.routeResponse(source, res);
|
||||
break;
|
||||
}
|
||||
|
@ -488,7 +549,8 @@ export class Agent {
|
|||
|
||||
|
||||
cleanKill(msg='Killing agent process...', code=1) {
|
||||
this.history.add('system', msg);
|
||||
// Assuming cleanKill messages don't have images
|
||||
await this.history.add('system', msg, null);
|
||||
this.bot.chat(code > 1 ? 'Restarting.': 'Exiting.');
|
||||
this.history.save();
|
||||
process.exit(code);
|
||||
|
@ -497,7 +559,8 @@ export class Agent {
|
|||
if (this.task.data) {
|
||||
let res = this.task.isDone();
|
||||
if (res) {
|
||||
await this.history.add('system', `Task ended with score : ${res.score}`);
|
||||
// Assuming task end messages don't have images
|
||||
await this.history.add('system', `Task ended with score : ${res.score}`, null);
|
||||
await this.history.save();
|
||||
// await new Promise(resolve => setTimeout(resolve, 3000)); // Wait 3 second for save to complete
|
||||
console.log('Task finished:', res.message);
|
||||
|
|
|
@ -428,6 +428,13 @@ export const actionsList = [
|
|||
}
|
||||
},
|
||||
perform: async function(agent, player_name, direction) {
|
||||
if (agent.vision_interpreter && agent.vision_interpreter.vision_mode === 'off') {
|
||||
return "Vision commands are disabled as vision mode is 'off'.";
|
||||
}
|
||||
// Also check if vision_interpreter or camera is not available if mode is not 'off'
|
||||
if (agent.vision_interpreter && !agent.vision_interpreter.camera && agent.vision_interpreter.vision_mode !== 'off') {
|
||||
return "Camera is not available, cannot perform look command.";
|
||||
}
|
||||
if (direction !== 'at' && direction !== 'with') {
|
||||
return "Invalid direction. Use 'at' or 'with'.";
|
||||
}
|
||||
|
@ -448,6 +455,13 @@ export const actionsList = [
|
|||
'z': { type: 'int', description: 'z coordinate' }
|
||||
},
|
||||
perform: async function(agent, x, y, z) {
|
||||
if (agent.vision_interpreter && agent.vision_interpreter.vision_mode === 'off') {
|
||||
return "Vision commands are disabled as vision mode is 'off'.";
|
||||
}
|
||||
// Also check if vision_interpreter or camera is not available if mode is not 'off'
|
||||
if (agent.vision_interpreter && !agent.vision_interpreter.camera && agent.vision_interpreter.vision_mode !== 'off') {
|
||||
return "Camera is not available, cannot perform look command.";
|
||||
}
|
||||
let result = "";
|
||||
const actionFn = async () => {
|
||||
result = await agent.vision_interpreter.lookAtPosition(x, y, z);
|
||||
|
|
|
@ -58,7 +58,7 @@ export class History {
|
|||
}
|
||||
}
|
||||
|
||||
async add(name, content) {
|
||||
async add(name, content, imagePath = null) {
|
||||
let role = 'assistant';
|
||||
if (name === 'system') {
|
||||
role = 'system';
|
||||
|
@ -67,7 +67,7 @@ export class History {
|
|||
role = 'user';
|
||||
content = `${name}: ${content}`;
|
||||
}
|
||||
this.turns.push({role, content});
|
||||
this.turns.push({role, content, imagePath});
|
||||
|
||||
if (this.turns.length >= this.max_messages) {
|
||||
let chunk = this.turns.splice(0, this.summary_chunk_size);
|
||||
|
|
|
@ -60,8 +60,8 @@ export class Camera extends EventEmitter {
|
|||
const buf = await getBufferFromStream(imageStream);
|
||||
await this._ensureScreenshotDirectory();
|
||||
await fs.writeFile(`${this.fp}/${filename}.jpg`, buf);
|
||||
console.log('saved', filename);
|
||||
return filename;
|
||||
console.log('saved', filename + '.jpg');
|
||||
return filename + '.jpg';
|
||||
}
|
||||
|
||||
async _ensureScreenshotDirectory() {
|
||||
|
|
|
@ -1,21 +1,29 @@
|
|||
import { Vec3 } from 'vec3';
|
||||
import { Camera } from "./camera.js";
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
export class VisionInterpreter {
|
||||
constructor(agent, allow_vision) {
|
||||
constructor(agent, vision_mode) {
|
||||
this.agent = agent;
|
||||
this.allow_vision = allow_vision;
|
||||
this.vision_mode = vision_mode;
|
||||
this.fp = './bots/'+agent.name+'/screenshots/';
|
||||
if (allow_vision) {
|
||||
if (this.vision_mode !== 'off') {
|
||||
this.camera = new Camera(agent.bot, this.fp);
|
||||
}
|
||||
}
|
||||
|
||||
async lookAtPlayer(player_name, direction) {
|
||||
if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
|
||||
if (this.vision_mode === 'off') {
|
||||
return "Vision is disabled. Use other methods to describe the environment.";
|
||||
}
|
||||
if (!this.camera) {
|
||||
return "Camera is not initialized. Vision may be set to 'off'.";
|
||||
}
|
||||
if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
|
||||
return "Vision requests are not enabled for the current model. Cannot analyze image.";
|
||||
}
|
||||
|
||||
let result = "";
|
||||
const bot = this.agent.bot;
|
||||
const player = bot.players[player_name]?.entity;
|
||||
|
@ -26,30 +34,51 @@ export class VisionInterpreter {
|
|||
let filename;
|
||||
if (direction === 'with') {
|
||||
await bot.look(player.yaw, player.pitch);
|
||||
result = `Looking in the same direction as ${player_name}\n`;
|
||||
result = `Looking in the same direction as ${player_name}.\n`;
|
||||
filename = await this.camera.capture();
|
||||
this.agent.latestScreenshotPath = filename;
|
||||
} else {
|
||||
await bot.lookAt(new Vec3(player.position.x, player.position.y + player.height, player.position.z));
|
||||
result = `Looking at player ${player_name}\n`;
|
||||
result = `Looking at player ${player_name}.\n`;
|
||||
filename = await this.camera.capture();
|
||||
|
||||
this.agent.latestScreenshotPath = filename;
|
||||
}
|
||||
|
||||
if (this.vision_mode === 'prompted') {
|
||||
return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
|
||||
} else if (this.vision_mode === 'always') {
|
||||
return result + "Screenshot taken and stored.";
|
||||
}
|
||||
// Should not be reached if vision_mode is one of the expected values
|
||||
return "Error: Unknown vision mode.";
|
||||
}
|
||||
|
||||
async lookAtPosition(x, y, z) {
|
||||
if (!this.allow_vision || !this.agent.prompter.vision_model.sendVisionRequest) {
|
||||
if (this.vision_mode === 'off') {
|
||||
return "Vision is disabled. Use other methods to describe the environment.";
|
||||
}
|
||||
if (!this.camera) {
|
||||
return "Camera is not initialized. Vision may be set to 'off'.";
|
||||
}
|
||||
if (!this.agent.prompter.vision_model.sendVisionRequest && this.vision_mode === 'prompted') {
|
||||
return "Vision requests are not enabled for the current model. Cannot analyze image.";
|
||||
}
|
||||
|
||||
let result = "";
|
||||
const bot = this.agent.bot;
|
||||
await bot.lookAt(new Vec3(x, y + 2, z));
|
||||
result = `Looking at coordinate ${x}, ${y}, ${z}\n`;
|
||||
await bot.lookAt(new Vec3(x, y + 2, z)); // lookAt requires y to be eye level, so +2 from feet
|
||||
result = `Looking at coordinate ${x}, ${y}, ${z}.\n`;
|
||||
|
||||
let filename = await this.camera.capture();
|
||||
this.agent.latestScreenshotPath = filename;
|
||||
|
||||
if (this.vision_mode === 'prompted') {
|
||||
return result + `Image analysis: "${await this.analyzeImage(filename)}"`;
|
||||
} else if (this.vision_mode === 'always') {
|
||||
return result + "Screenshot taken and stored.";
|
||||
}
|
||||
// Should not be reached if vision_mode is one of the expected values
|
||||
return "Error: Unknown vision mode.";
|
||||
}
|
||||
|
||||
getCenterBlockInfo() {
|
||||
|
@ -66,7 +95,9 @@ export class VisionInterpreter {
|
|||
|
||||
async analyzeImage(filename) {
|
||||
try {
|
||||
const imageBuffer = fs.readFileSync(`${this.fp}/${filename}.jpg`);
|
||||
// filename already includes .jpg from camera.js
|
||||
const imageFullPath = path.join(this.fp, filename);
|
||||
const imageBuffer = fs.readFileSync(imageFullPath);
|
||||
const messages = this.agent.history.getHistory();
|
||||
|
||||
const blockInfo = this.getCenterBlockInfo();
|
||||
|
|
|
@ -1,43 +1,86 @@
|
|||
import Anthropic from '@anthropic-ai/sdk';
|
||||
import { strictFormat } from '../utils/text.js';
|
||||
import { getKey } from '../utils/keys.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class Claude {
|
||||
constructor(model_name, url, params) {
|
||||
this.model_name = model_name;
|
||||
this.params = params || {};
|
||||
|
||||
let config = {};
|
||||
if (url)
|
||||
config.baseURL = url;
|
||||
|
||||
config.apiKey = getKey('ANTHROPIC_API_KEY');
|
||||
|
||||
this.anthropic = new Anthropic(config);
|
||||
this.supportsRawImageInput = true;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage) {
|
||||
const messages = strictFormat(turns);
|
||||
async sendRequest(turns, systemMessage, imageData = null) {
|
||||
const messages = strictFormat(turns); // Ensure messages are in role/content format
|
||||
let res = null;
|
||||
|
||||
if (imageData) {
|
||||
const visionModels = ["claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"];
|
||||
if (!visionModels.some(vm => this.model_name.includes(vm))) {
|
||||
console.warn(`[Claude] Warning: imageData provided for model ${this.model_name}, which is not explicitly a Claude 3 vision model. The image may be ignored or cause an error.`);
|
||||
}
|
||||
|
||||
let lastUserMessageIndex = -1;
|
||||
for (let i = messages.length - 1; i >= 0; i--) {
|
||||
if (messages[i].role === 'user') {
|
||||
lastUserMessageIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (lastUserMessageIndex !== -1) {
|
||||
const userMessage = messages[lastUserMessageIndex];
|
||||
const imagePart = {
|
||||
type: "image",
|
||||
source: {
|
||||
type: "base64",
|
||||
media_type: "image/jpeg", // Assuming JPEG
|
||||
data: imageData.toString('base64')
|
||||
}
|
||||
};
|
||||
|
||||
if (typeof userMessage.content === 'string') {
|
||||
userMessage.content = [{ type: "text", text: userMessage.content }, imagePart];
|
||||
} else if (Array.isArray(userMessage.content)) {
|
||||
// If content is already an array, add the image part.
|
||||
// This handles cases where a user message might already have multiple parts (e.g. multiple text parts, though less common for this bot).
|
||||
userMessage.content.push(imagePart);
|
||||
} else {
|
||||
// Fallback or error if content is an unexpected type
|
||||
console.warn('[Claude] Last user message content is not a string or array. Cannot attach image.');
|
||||
userMessage.content = [imagePart]; // Or create a new message with just the image if appropriate
|
||||
}
|
||||
} else {
|
||||
console.warn('[Claude] imageData provided, but no user message found to attach it to. Image not sent.');
|
||||
// Optionally, could create a new user message with the image if that's desired behavior.
|
||||
// messages.push({ role: 'user', content: [imagePart] });
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
console.log('Awaiting anthropic api response...')
|
||||
console.log('Awaiting anthropic api response...');
|
||||
// console.log('Formatted Messages for API:', JSON.stringify(messages, null, 2));
|
||||
// console.log('System prompt for API:', systemMessage);
|
||||
|
||||
if (!this.params.max_tokens) {
|
||||
if (this.params.thinking?.budget_tokens) {
|
||||
this.params.max_tokens = this.params.thinking.budget_tokens + 1000;
|
||||
// max_tokens must be greater than thinking.budget_tokens
|
||||
this.params.max_tokens = this.params.thinking.budget_tokens + 1000; // max_tokens must be greater
|
||||
} else {
|
||||
this.params.max_tokens = 4096;
|
||||
}
|
||||
}
|
||||
const resp = await this.anthropic.messages.create({
|
||||
model: this.model_name || "claude-3-sonnet-20240229",
|
||||
model: this.model_name || "claude-3-sonnet-20240229", // Default to a vision-capable model if none specified
|
||||
system: systemMessage,
|
||||
messages: messages,
|
||||
messages: messages, // messages array is now potentially modified with image data
|
||||
...(this.params || {})
|
||||
});
|
||||
|
||||
console.log('Received.')
|
||||
// get first content of type text
|
||||
const textContent = resp.content.find(content => content.type === 'text');
|
||||
if (textContent) {
|
||||
res = textContent.text;
|
||||
|
@ -45,8 +88,7 @@ export class Claude {
|
|||
console.warn('No text content found in the response.');
|
||||
res = 'No response from Claude.';
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
} catch (err) {
|
||||
if (err.message.includes("does not support image input")) {
|
||||
res = "Vision is only supported by certain models.";
|
||||
} else {
|
||||
|
@ -54,18 +96,17 @@ export class Claude {
|
|||
}
|
||||
console.log(err);
|
||||
}
|
||||
const logMessagesForClaude = [{ role: "system", content: systemMessage }].concat(turns);
|
||||
if (typeof res === 'string') {
|
||||
res = res.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(logMessagesForClaude), res);
|
||||
return res;
|
||||
}
|
||||
|
||||
async sendVisionRequest(turns, systemMessage, imageBuffer) {
|
||||
const imageMessages = [...turns];
|
||||
imageMessages.push({
|
||||
role: "user",
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: systemMessage
|
||||
},
|
||||
const visionUserMessageContent = [
|
||||
{ type: "text", text: systemMessage },
|
||||
{
|
||||
type: "image",
|
||||
source: {
|
||||
|
@ -74,10 +115,15 @@ export class Claude {
|
|||
data: imageBuffer.toString('base64')
|
||||
}
|
||||
}
|
||||
]
|
||||
});
|
||||
];
|
||||
const turnsForAPIRequest = [...turns, { role: "user", content: visionUserMessageContent }];
|
||||
|
||||
return this.sendRequest(imageMessages, systemMessage);
|
||||
const res = await this.sendRequest(turnsForAPIRequest, systemMessage);
|
||||
|
||||
if (imageBuffer && res) {
|
||||
logVision(turns, imageBuffer, res, systemMessage);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
async embed(text) {
|
||||
|
|
|
@ -1,43 +1,92 @@
|
|||
import OpenAIApi from 'openai';
|
||||
import { getKey, hasKey } from '../utils/keys.js';
|
||||
import { strictFormat } from '../utils/text.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class DeepSeek {
|
||||
constructor(model_name, url, params) {
|
||||
this.model_name = model_name;
|
||||
this.params = params;
|
||||
|
||||
let config = {};
|
||||
|
||||
config.baseURL = url || 'https://api.deepseek.com';
|
||||
config.apiKey = getKey('DEEPSEEK_API_KEY');
|
||||
|
||||
this.openai = new OpenAIApi(config);
|
||||
this.supportsRawImageInput = true; // Assuming DeepSeek models used can support this OpenAI-like format
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage, stop_seq='***') {
|
||||
async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
|
||||
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
|
||||
|
||||
messages = strictFormat(messages);
|
||||
|
||||
if (imageData) {
|
||||
console.warn(`[DeepSeek] imageData provided. Ensure the configured DeepSeek model ('${this.model_name || "deepseek-chat"}') is vision-capable.`);
|
||||
|
||||
let lastUserMessageIndex = -1;
|
||||
for (let i = messages.length - 1; i >= 0; i--) {
|
||||
if (messages[i].role === 'user') {
|
||||
lastUserMessageIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (lastUserMessageIndex !== -1) {
|
||||
const userMessage = messages[lastUserMessageIndex];
|
||||
const originalContent = userMessage.content; // Should be a string
|
||||
|
||||
if (typeof originalContent === 'string') {
|
||||
userMessage.content = [
|
||||
{ type: "text", text: originalContent },
|
||||
{
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: `data:image/jpeg;base64,${imageData.toString('base64')}`
|
||||
}
|
||||
}
|
||||
];
|
||||
} else {
|
||||
// If content is already an array (e.g. from a previous modification or different source)
|
||||
// We'd need a more robust way to handle this, but for now, assume it's a string
|
||||
// or log an error/warning.
|
||||
console.warn('[DeepSeek] Last user message content was not a simple string. Attempting to add image, but structure might be unexpected.');
|
||||
if(Array.isArray(originalContent)) {
|
||||
originalContent.push({
|
||||
type: "image_url",
|
||||
image_url: { url: `data:image/jpeg;base64,${imageData.toString('base64')}` }
|
||||
});
|
||||
userMessage.content = originalContent;
|
||||
} else { // Fallback if it's some other type, just overwrite with new structure
|
||||
userMessage.content = [
|
||||
{ type: "text", text: String(originalContent) }, // Attempt to stringify
|
||||
{
|
||||
type: "image_url",
|
||||
image_url: { url: `data:image/jpeg;base64,${imageData.toString('base64')}` }
|
||||
}
|
||||
];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
console.warn('[DeepSeek] imageData provided, but no user message found to attach it to. Image not sent.');
|
||||
// Or: messages.push({ role: 'user', content: [ { type: "image_url", image_url: { url: ... } } ] });
|
||||
}
|
||||
}
|
||||
|
||||
const pack = {
|
||||
model: this.model_name || "deepseek-chat",
|
||||
messages,
|
||||
stop: stop_seq,
|
||||
...(this.params || {})
|
||||
};
|
||||
|
||||
let res = null;
|
||||
try {
|
||||
|
||||
console.log('Awaiting deepseek api response...')
|
||||
// console.log('Messages:', messages);
|
||||
|
||||
let completion = await this.openai.chat.completions.create(pack);
|
||||
if (completion.choices[0].finish_reason == 'length')
|
||||
throw new Error('Context length exceeded');
|
||||
console.log('Received.')
|
||||
console.log('Received.');
|
||||
res = completion.choices[0].message.content;
|
||||
}
|
||||
catch (err) {
|
||||
} catch (err) {
|
||||
if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
|
||||
console.log('Context length exceeded, trying again with shorter context.');
|
||||
return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
|
||||
|
@ -46,6 +95,10 @@ export class DeepSeek {
|
|||
res = 'My brain disconnected, try again.';
|
||||
}
|
||||
}
|
||||
if (typeof res === 'string') {
|
||||
res = res.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), res);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -53,6 +106,3 @@ export class DeepSeek {
|
|||
throw new Error('Embeddings are not supported by Deepseek.');
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import { GoogleGenerativeAI } from '@google/generative-ai';
|
||||
import { toSinglePrompt, strictFormat } from '../utils/text.js';
|
||||
import { getKey } from '../utils/keys.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class Gemini {
|
||||
constructor(model_name, url, params) {
|
||||
|
@ -8,52 +9,29 @@ export class Gemini {
|
|||
this.params = params;
|
||||
this.url = url;
|
||||
this.safetySettings = [
|
||||
{
|
||||
"category": "HARM_CATEGORY_DANGEROUS",
|
||||
"threshold": "BLOCK_NONE",
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_HARASSMENT",
|
||||
"threshold": "BLOCK_NONE",
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_HATE_SPEECH",
|
||||
"threshold": "BLOCK_NONE",
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
|
||||
"threshold": "BLOCK_NONE",
|
||||
},
|
||||
{
|
||||
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
|
||||
"threshold": "BLOCK_NONE",
|
||||
},
|
||||
{ "category": "HARM_CATEGORY_DANGEROUS", "threshold": "BLOCK_NONE" },
|
||||
{ "category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE" },
|
||||
{ "category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE" },
|
||||
{ "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE" },
|
||||
{ "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE" },
|
||||
];
|
||||
|
||||
this.genAI = new GoogleGenerativeAI(getKey('GEMINI_API_KEY'));
|
||||
this.supportsRawImageInput = true;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage) {
|
||||
async sendRequest(turns, systemMessage, imageData = null) {
|
||||
let model;
|
||||
const modelConfig = {
|
||||
model: this.model_name || "gemini-1.5-flash",
|
||||
// systemInstruction does not work bc google is trash
|
||||
};
|
||||
if (this.url) {
|
||||
model = this.genAI.getGenerativeModel(
|
||||
modelConfig,
|
||||
{ baseUrl: this.url },
|
||||
{ safetySettings: this.safetySettings }
|
||||
);
|
||||
model = this.genAI.getGenerativeModel(modelConfig, { baseUrl: this.url }, { safetySettings: this.safetySettings });
|
||||
} else {
|
||||
model = this.genAI.getGenerativeModel(
|
||||
modelConfig,
|
||||
{ safetySettings: this.safetySettings }
|
||||
);
|
||||
model = this.genAI.getGenerativeModel(modelConfig, { safetySettings: this.safetySettings });
|
||||
}
|
||||
|
||||
console.log('Awaiting Google API response...');
|
||||
|
||||
const originalTurnsForLog = [{role: 'system', content: systemMessage}, ...turns];
|
||||
turns.unshift({ role: 'system', content: systemMessage });
|
||||
turns = strictFormat(turns);
|
||||
let contents = [];
|
||||
|
@ -64,24 +42,32 @@ export class Gemini {
|
|||
});
|
||||
}
|
||||
|
||||
if (imageData && contents.length > 0) {
|
||||
const lastContent = contents[contents.length - 1];
|
||||
if (lastContent.role === 'user') { // Ensure the image is added to a user turn
|
||||
lastContent.parts.push({
|
||||
inline_data: {
|
||||
mime_type: 'image/jpeg',
|
||||
data: imageData.toString('base64')
|
||||
}
|
||||
});
|
||||
} else {
|
||||
// This case should ideally not happen if imageData is tied to a user message.
|
||||
// If it does, we could append a new user turn with the image,
|
||||
// or log a warning and send without the image.
|
||||
// For now, let's assume the last message is the user's if imageData is present.
|
||||
console.warn('[Gemini] imageData provided, but the last content entry was not from a user. Image not sent.');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await model.generateContent({
|
||||
contents,
|
||||
generationConfig: {
|
||||
...(this.params || {})
|
||||
}
|
||||
generationConfig: { ...(this.params || {}) }
|
||||
});
|
||||
const response = await result.response;
|
||||
let text;
|
||||
|
||||
// Handle "thinking" models since they smart
|
||||
if (this.model_name && this.model_name.includes("thinking")) {
|
||||
if (
|
||||
response.candidates &&
|
||||
response.candidates.length > 0 &&
|
||||
response.candidates[0].content &&
|
||||
response.candidates[0].content.parts &&
|
||||
response.candidates[0].content.parts.length > 1
|
||||
) {
|
||||
if (response.candidates?.length > 0 && response.candidates[0].content?.parts?.length > 1) {
|
||||
text = response.candidates[0].content.parts[1].text;
|
||||
} else {
|
||||
console.warn("Unexpected response structure for thinking model:", response);
|
||||
|
@ -90,34 +76,22 @@ export class Gemini {
|
|||
} else {
|
||||
text = response.text();
|
||||
}
|
||||
|
||||
console.log('Received.');
|
||||
|
||||
if (typeof text === 'string') {
|
||||
text = text.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(originalTurnsForLog), text);
|
||||
return text;
|
||||
}
|
||||
|
||||
async sendVisionRequest(turns, systemMessage, imageBuffer) {
|
||||
let model;
|
||||
if (this.url) {
|
||||
model = this.genAI.getGenerativeModel(
|
||||
{ model: this.model_name || "gemini-1.5-flash" },
|
||||
{ baseUrl: this.url },
|
||||
{ safetySettings: this.safetySettings }
|
||||
);
|
||||
model = this.genAI.getGenerativeModel({ model: this.model_name || "gemini-1.5-flash" }, { baseUrl: this.url }, { safetySettings: this.safetySettings });
|
||||
} else {
|
||||
model = this.genAI.getGenerativeModel(
|
||||
{ model: this.model_name || "gemini-1.5-flash" },
|
||||
{ safetySettings: this.safetySettings }
|
||||
);
|
||||
model = this.genAI.getGenerativeModel({ model: this.model_name || "gemini-1.5-flash" }, { safetySettings: this.safetySettings });
|
||||
}
|
||||
|
||||
const imagePart = {
|
||||
inlineData: {
|
||||
data: imageBuffer.toString('base64'),
|
||||
mimeType: 'image/jpeg'
|
||||
}
|
||||
};
|
||||
|
||||
const imagePart = { inlineData: { data: imageBuffer.toString('base64'), mimeType: 'image/jpeg' } };
|
||||
const stop_seq = '***';
|
||||
const prompt = toSinglePrompt(turns, systemMessage, stop_seq, 'model');
|
||||
let res = null;
|
||||
|
@ -127,6 +101,9 @@ export class Gemini {
|
|||
const response = await result.response;
|
||||
const text = response.text();
|
||||
console.log('Received.');
|
||||
if (imageBuffer && text) {
|
||||
logVision(turns, imageBuffer, text, prompt);
|
||||
}
|
||||
if (!text.includes(stop_seq)) return text;
|
||||
const idx = text.indexOf(stop_seq);
|
||||
res = text.slice(0, idx);
|
||||
|
@ -137,6 +114,11 @@ export class Gemini {
|
|||
} else {
|
||||
res = "An unexpected error occurred, please try again.";
|
||||
}
|
||||
const loggedTurnsForError = [{role: 'system', content: systemMessage}, ...turns];
|
||||
if (typeof res === 'string') {
|
||||
res = res.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(loggedTurnsForError), res);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
@ -144,16 +126,10 @@ export class Gemini {
|
|||
async embed(text) {
|
||||
let model;
|
||||
if (this.url) {
|
||||
model = this.genAI.getGenerativeModel(
|
||||
{ model: "text-embedding-004" },
|
||||
{ baseUrl: this.url }
|
||||
);
|
||||
model = this.genAI.getGenerativeModel({ model: "text-embedding-004" }, { baseUrl: this.url });
|
||||
} else {
|
||||
model = this.genAI.getGenerativeModel(
|
||||
{ model: "text-embedding-004" }
|
||||
);
|
||||
model = this.genAI.getGenerativeModel({ model: "text-embedding-004" });
|
||||
}
|
||||
|
||||
const result = await model.embedContent(text);
|
||||
return result.embedding.values;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import OpenAIApi from 'openai';
|
||||
import { getKey } from '../utils/keys.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class GLHF {
|
||||
constructor(model_name, url) {
|
||||
|
@ -12,9 +13,15 @@ export class GLHF {
|
|||
apiKey,
|
||||
baseURL: url || "https://glhf.chat/api/openai/v1"
|
||||
});
|
||||
// Direct image data in sendRequest is not supported by this wrapper.
|
||||
// Specific vision models/methods should be used if available through the service.
|
||||
this.supportsRawImageInput = false;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage, stop_seq = '***') {
|
||||
async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
|
||||
if (imageData) {
|
||||
console.warn(`[GLHF] Warning: imageData provided to sendRequest, but this method in glhf.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
|
||||
}
|
||||
// Construct the message array for the API request.
|
||||
let messages = [{ role: 'system', content: systemMessage }].concat(turns);
|
||||
const pack = {
|
||||
|
@ -42,15 +49,18 @@ export class GLHF {
|
|||
continue;
|
||||
}
|
||||
// If there's a closing </think> tag but no opening <think>, prepend one.
|
||||
|
||||
if (res.includes("</think>") && !res.includes("<think>")) {
|
||||
res = "<think>" + res;
|
||||
}
|
||||
finalRes = res.replace(/<\|separator\|>/g, '*no response*');
|
||||
break; // Valid response obtained.
|
||||
|
||||
} catch (err) {
|
||||
if ((err.message === 'Context length exceeded' || err.code === 'context_length_exceeded') && turns.length > 1) {
|
||||
console.log('Context length exceeded, trying again with shorter context.');
|
||||
return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
|
||||
// Pass imageData along in recursive call, though it will be ignored again
|
||||
return await this.sendRequest(turns.slice(1), systemMessage, imageData, stop_seq);
|
||||
} else {
|
||||
console.error(err);
|
||||
finalRes = 'My brain disconnected, try again.';
|
||||
|
@ -61,6 +71,11 @@ export class GLHF {
|
|||
if (finalRes === null) {
|
||||
finalRes = "I thought too hard, sorry, try again";
|
||||
}
|
||||
|
||||
if (typeof finalRes === 'string') {
|
||||
finalRes = finalRes.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), finalRes);
|
||||
return finalRes;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,27 +1,58 @@
|
|||
import OpenAIApi from 'openai';
|
||||
import { getKey, hasKey } from '../utils/keys.js';
|
||||
import { strictFormat } from '../utils/text.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class GPT {
|
||||
constructor(model_name, url, params) {
|
||||
this.model_name = model_name;
|
||||
this.params = params;
|
||||
|
||||
let config = {};
|
||||
if (url)
|
||||
config.baseURL = url;
|
||||
|
||||
if (hasKey('OPENAI_ORG_ID'))
|
||||
config.organization = getKey('OPENAI_ORG_ID');
|
||||
|
||||
config.apiKey = getKey('OPENAI_API_KEY');
|
||||
|
||||
this.openai = new OpenAIApi(config);
|
||||
this.supportsRawImageInput = true;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage, stop_seq='***') {
|
||||
async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
|
||||
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
|
||||
messages = strictFormat(messages);
|
||||
|
||||
if (imageData) {
|
||||
const visionModels = ["gpt-4-vision-preview", "gpt-4o", "gpt-4-turbo"];
|
||||
if (!visionModels.some(vm => this.model_name.includes(vm))) {
|
||||
console.warn(`[GPT] Warning: imageData provided for model ${this.model_name}, which is not explicitly a vision model. The image may be ignored or cause an error.`);
|
||||
}
|
||||
|
||||
let lastUserMessageIndex = -1;
|
||||
for (let i = messages.length - 1; i >= 0; i--) {
|
||||
if (messages[i].role === 'user') {
|
||||
lastUserMessageIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (lastUserMessageIndex !== -1) {
|
||||
const originalContent = messages[lastUserMessageIndex].content;
|
||||
messages[lastUserMessageIndex].content = [
|
||||
{ type: "text", text: originalContent },
|
||||
{
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: `data:image/jpeg;base64,${imageData.toString('base64')}`
|
||||
}
|
||||
}
|
||||
];
|
||||
} else {
|
||||
// No user message to attach image to, log warning or prepend a new one?
|
||||
// For now, log a warning. Prompter should ensure user message exists if imagePath is set.
|
||||
console.warn('[GPT] imageData provided, but no user message found to attach it to. Image not sent.');
|
||||
}
|
||||
}
|
||||
|
||||
const pack = {
|
||||
model: this.model_name || "gpt-3.5-turbo",
|
||||
messages,
|
||||
|
@ -31,19 +62,17 @@ export class GPT {
|
|||
if (this.model_name.includes('o1')) {
|
||||
delete pack.stop;
|
||||
}
|
||||
|
||||
let res = null;
|
||||
|
||||
try {
|
||||
console.log('Awaiting openai api response from model', this.model_name)
|
||||
// console.log('Messages:', messages);
|
||||
|
||||
console.log('Awaiting openai api response from model', this.model_name);
|
||||
|
||||
let completion = await this.openai.chat.completions.create(pack);
|
||||
if (completion.choices[0].finish_reason == 'length')
|
||||
throw new Error('Context length exceeded');
|
||||
console.log('Received.')
|
||||
console.log('Received.');
|
||||
res = completion.choices[0].message.content;
|
||||
}
|
||||
catch (err) {
|
||||
} catch (err) {
|
||||
if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
|
||||
console.log('Context length exceeded, trying again with shorter context.');
|
||||
return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
|
||||
|
@ -55,25 +84,32 @@ export class GPT {
|
|||
res = 'My brain disconnected, try again.';
|
||||
}
|
||||
}
|
||||
if (typeof res === 'string') {
|
||||
res = res.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), res);
|
||||
return res;
|
||||
}
|
||||
|
||||
async sendVisionRequest(messages, systemMessage, imageBuffer) {
|
||||
const imageMessages = [...messages];
|
||||
imageMessages.push({
|
||||
async sendVisionRequest(original_turns, systemMessage, imageBuffer) {
|
||||
const imageFormattedTurns = [...original_turns];
|
||||
imageFormattedTurns.push({
|
||||
role: "user",
|
||||
content: [
|
||||
{ type: "text", text: systemMessage },
|
||||
{
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
|
||||
}
|
||||
image_url: { url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}` }
|
||||
}
|
||||
]
|
||||
});
|
||||
|
||||
return this.sendRequest(imageMessages, systemMessage);
|
||||
const res = await this.sendRequest(imageFormattedTurns, systemMessage);
|
||||
|
||||
if (imageBuffer && res) {
|
||||
logVision(original_turns, imageBuffer, res, systemMessage);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
async embed(text) {
|
||||
|
@ -86,8 +122,4 @@ export class GPT {
|
|||
});
|
||||
return embedding.data[0].embedding;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import OpenAIApi from 'openai';
|
||||
import { getKey } from '../utils/keys.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
// xAI doesn't supply a SDK for their models, but fully supports OpenAI and Anthropic SDKs
|
||||
export class Grok {
|
||||
|
@ -7,42 +8,41 @@ export class Grok {
|
|||
this.model_name = model_name;
|
||||
this.url = url;
|
||||
this.params = params;
|
||||
|
||||
let config = {};
|
||||
if (url)
|
||||
config.baseURL = url;
|
||||
else
|
||||
config.baseURL = "https://api.x.ai/v1"
|
||||
|
||||
config.apiKey = getKey('XAI_API_KEY');
|
||||
|
||||
this.openai = new OpenAIApi(config);
|
||||
// Direct image data in sendRequest is not supported by this wrapper for standard chat.
|
||||
// Grok may have specific vision capabilities, but this method assumes text-only.
|
||||
this.supportsRawImageInput = false;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage, stop_seq='***') {
|
||||
async sendRequest(turns, systemMessage, imageData = null, stop_seq='***') {
|
||||
if (imageData) {
|
||||
console.warn(`[Grok] Warning: imageData provided to sendRequest, but this method in grok.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
|
||||
}
|
||||
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
|
||||
|
||||
const pack = {
|
||||
model: this.model_name || "grok-beta",
|
||||
messages,
|
||||
stop: [stop_seq],
|
||||
...(this.params || {})
|
||||
};
|
||||
|
||||
let res = null;
|
||||
try {
|
||||
console.log('Awaiting xai api response...')
|
||||
///console.log('Messages:', messages);
|
||||
let completion = await this.openai.chat.completions.create(pack);
|
||||
if (completion.choices[0].finish_reason == 'length')
|
||||
throw new Error('Context length exceeded');
|
||||
console.log('Received.')
|
||||
res = completion.choices[0].message.content;
|
||||
}
|
||||
catch (err) {
|
||||
} catch (err) {
|
||||
if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
|
||||
console.log('Context length exceeded, trying again with shorter context.');
|
||||
return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
|
||||
return await this.sendRequest(turns.slice(1), systemMessage, imageData, stop_seq);
|
||||
} else if (err.message.includes('The model expects a single `text` element per message.')) {
|
||||
console.log(err);
|
||||
res = 'Vision is only supported by certain models.';
|
||||
|
@ -52,31 +52,36 @@ export class Grok {
|
|||
}
|
||||
}
|
||||
// sometimes outputs special token <|separator|>, just replace it
|
||||
return res.replace(/<\|separator\|>/g, '*no response*');
|
||||
let finalResponseText = res ? res.replace(/<\|separator\|>/g, '*no response*') : (res === null ? "*no response*" : res);
|
||||
if (typeof finalResponseText === 'string') {
|
||||
finalResponseText = finalResponseText.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), finalResponseText);
|
||||
return finalResponseText;
|
||||
}
|
||||
|
||||
async sendVisionRequest(messages, systemMessage, imageBuffer) {
|
||||
const imageMessages = [...messages];
|
||||
imageMessages.push({
|
||||
async sendVisionRequest(original_turns, systemMessage, imageBuffer) {
|
||||
const imageFormattedTurns = [...original_turns];
|
||||
imageFormattedTurns.push({
|
||||
role: "user",
|
||||
content: [
|
||||
{ type: "text", text: systemMessage },
|
||||
{
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
|
||||
}
|
||||
image_url: { url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}` }
|
||||
}
|
||||
]
|
||||
});
|
||||
|
||||
return this.sendRequest(imageMessages, systemMessage);
|
||||
const res = await this.sendRequest(imageFormattedTurns, systemMessage);
|
||||
|
||||
if (imageBuffer && res) {
|
||||
logVision(original_turns, imageBuffer, res, systemMessage);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
async embed(text) {
|
||||
throw new Error('Embeddings are not supported by Grok.');
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
import Groq from 'groq-sdk'
|
||||
import fs from "fs";
|
||||
import { getKey } from '../utils/keys.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
// THIS API IS NOT TO BE CONFUSED WITH GROK!
|
||||
// Go to grok.js for that. :)
|
||||
|
||||
// Umbrella class for everything under the sun... That GroqCloud provides, that is.
|
||||
export class GroqCloudAPI {
|
||||
|
||||
constructor(model_name, url, params) {
|
||||
|
||||
this.model_name = model_name;
|
||||
this.url = url;
|
||||
this.params = params || {};
|
||||
|
@ -18,21 +18,23 @@ export class GroqCloudAPI {
|
|||
delete this.params.tools;
|
||||
// This is just a bit of future-proofing in case we drag Mindcraft in that direction.
|
||||
|
||||
// I'm going to do a sneaky ReplicateAPI theft for a lot of this, aren't I?
|
||||
if (this.url)
|
||||
console.warn("Groq Cloud has no implementation for custom URLs. Ignoring provided URL.");
|
||||
|
||||
this.groq = new Groq({ apiKey: getKey('GROQCLOUD_API_KEY') });
|
||||
|
||||
// Direct image data in sendRequest is not supported by this wrapper.
|
||||
// Groq may offer specific vision models/APIs, but this standard chat method assumes text.
|
||||
this.supportsRawImageInput = false;
|
||||
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage, stop_seq = null) {
|
||||
async sendRequest(turns, systemMessage, imageData = null, stop_seq = null) {
|
||||
if (imageData) {
|
||||
console.warn(`[Groq] Warning: imageData provided to sendRequest, but this method in groq.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
|
||||
}
|
||||
// Construct messages array
|
||||
let messages = [{"role": "system", "content": systemMessage}].concat(turns);
|
||||
|
||||
let res = null;
|
||||
|
||||
try {
|
||||
console.log("Awaiting Groq response...");
|
||||
|
||||
|
@ -42,7 +44,6 @@ export class GroqCloudAPI {
|
|||
this.params.max_completion_tokens = this.params.max_tokens;
|
||||
delete this.params.max_tokens;
|
||||
}
|
||||
|
||||
if (!this.params.max_completion_tokens) {
|
||||
this.params.max_completion_tokens = 4000;
|
||||
}
|
||||
|
@ -55,11 +56,15 @@ export class GroqCloudAPI {
|
|||
...(this.params || {})
|
||||
});
|
||||
|
||||
res = completion.choices[0].message;
|
||||
|
||||
res = res.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
||||
let responseText = completion.choices[0].message.content;
|
||||
if (typeof responseText === 'string') {
|
||||
responseText = responseText.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
catch(err) {
|
||||
log(JSON.stringify(messages), responseText);
|
||||
// Original cleaning of <think> tags for the *returned* response (not affecting log)
|
||||
responseText = responseText.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
||||
return responseText;
|
||||
} catch(err) {
|
||||
if (err.message.includes("content must be a string")) {
|
||||
res = "Vision is only supported by certain models.";
|
||||
} else {
|
||||
|
@ -67,29 +72,54 @@ export class GroqCloudAPI {
|
|||
res = "My brain disconnected, try again.";
|
||||
}
|
||||
console.log(err);
|
||||
if (typeof res === 'string') {
|
||||
res = res.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), res);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
async sendVisionRequest(messages, systemMessage, imageBuffer) {
|
||||
const imageMessages = messages.filter(message => message.role !== 'system');
|
||||
async sendVisionRequest(original_turns, systemMessage, imageBuffer) {
|
||||
const imageMessages = [...original_turns];
|
||||
imageMessages.push({
|
||||
role: "user",
|
||||
content: [
|
||||
{ type: "text", text: systemMessage },
|
||||
{
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
|
||||
}
|
||||
image_url: { url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}` }
|
||||
}
|
||||
]
|
||||
});
|
||||
|
||||
return this.sendRequest(imageMessages);
|
||||
const res = await this.sendRequest(imageMessages, systemMessage);
|
||||
|
||||
if (imageBuffer && res) {
|
||||
logVision(original_turns, imageBuffer, res, systemMessage);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
async embed(_) {
|
||||
throw new Error('Embeddings are not supported by Groq.');
|
||||
}
|
||||
}
|
||||
|
||||
export class GroqCloudTTS {
|
||||
constructor() {
|
||||
this.groq = new Groq({ apiKey: getKey('GROQCLOUD_API_KEY') });
|
||||
}
|
||||
|
||||
async transcribe(filePath, options = {}) {
|
||||
const transcription = await this.groq.audio.transcriptions.create({
|
||||
file: fs.createReadStream(filePath),
|
||||
model: options.model || "distil-whisper-large-v3-en", // or "whisper-large-v3-turbo"
|
||||
prompt: options.prompt || "",
|
||||
response_format: options.response_format || "json",
|
||||
language: options.language || "en",
|
||||
temperature: options.temperature !== undefined ? options.temperature : 0.0,
|
||||
});
|
||||
return transcription.text;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,31 +1,32 @@
|
|||
import { toSinglePrompt } from '../utils/text.js';
|
||||
import { getKey } from '../utils/keys.js';
|
||||
import { HfInference } from "@huggingface/inference";
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class HuggingFace {
|
||||
constructor(model_name, url, params) {
|
||||
// Remove 'huggingface/' prefix if present
|
||||
this.model_name = model_name.replace('huggingface/', '');
|
||||
this.url = url;
|
||||
this.params = params;
|
||||
|
||||
if (this.url) {
|
||||
console.warn("Hugging Face doesn't support custom urls!");
|
||||
}
|
||||
|
||||
this.huggingface = new HfInference(getKey('HUGGINGFACE_API_KEY'));
|
||||
// Direct image data in sendRequest is not supported by this wrapper.
|
||||
// HuggingFace Inference API has other methods for vision tasks.
|
||||
this.supportsRawImageInput = false;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage) {
|
||||
async sendRequest(turns, systemMessage, imageData = null) {
|
||||
if (imageData) {
|
||||
console.warn(`[HuggingFace] Warning: imageData provided to sendRequest, but this method in huggingface.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
|
||||
}
|
||||
const stop_seq = '***';
|
||||
// Build a single prompt from the conversation turns
|
||||
const prompt = toSinglePrompt(turns, null, stop_seq);
|
||||
// Fallback model if none was provided
|
||||
const model_name = this.model_name || 'meta-llama/Meta-Llama-3-8B';
|
||||
// Combine system message with the prompt
|
||||
const input = systemMessage + "\n" + prompt;
|
||||
|
||||
// We'll try up to 5 times in case of partial <think> blocks for DeepSeek-R1 models.
|
||||
const logInputMessages = [{role: 'system', content: systemMessage}, ...turns];
|
||||
const input = systemMessage + "
|
||||
" + prompt;
|
||||
const maxAttempts = 5;
|
||||
let attempt = 0;
|
||||
let finalRes = null;
|
||||
|
@ -35,7 +36,6 @@ export class HuggingFace {
|
|||
console.log(`Awaiting Hugging Face API response... (model: ${model_name}, attempt: ${attempt})`);
|
||||
let res = '';
|
||||
try {
|
||||
// Consume the streaming response chunk by chunk
|
||||
for await (const chunk of this.huggingface.chatCompletionStream({
|
||||
model: model_name,
|
||||
messages: [{ role: "user", content: input }],
|
||||
|
@ -46,36 +46,32 @@ export class HuggingFace {
|
|||
} catch (err) {
|
||||
console.log(err);
|
||||
res = 'My brain disconnected, try again.';
|
||||
// Break out immediately; we only retry when handling partial <think> tags.
|
||||
break;
|
||||
}
|
||||
|
||||
// If the model is DeepSeek-R1, check for mismatched <think> blocks.
|
||||
const hasOpenTag = res.includes("<think>");
|
||||
const hasCloseTag = res.includes("</think>");
|
||||
|
||||
// If there's a partial mismatch, warn and retry the entire request.
|
||||
if ((hasOpenTag && !hasCloseTag)) {
|
||||
console.warn("Partial <think> block detected. Re-generating...");
|
||||
continue;
|
||||
if (attempt < maxAttempts) continue;
|
||||
}
|
||||
|
||||
// If both tags are present, remove the <think> block entirely.
|
||||
if (hasOpenTag && hasCloseTag) {
|
||||
res = res.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
||||
}
|
||||
|
||||
finalRes = res;
|
||||
break; // Exit loop if we got a valid response.
|
||||
break;
|
||||
}
|
||||
|
||||
// If no valid response was obtained after max attempts, assign a fallback.
|
||||
if (finalRes == null) {
|
||||
console.warn("Could not get a valid <think> block or normal response after max attempts.");
|
||||
console.warn("Could not get a valid response after max attempts.");
|
||||
finalRes = 'I thought too hard, sorry, try again.';
|
||||
}
|
||||
console.log('Received.');
|
||||
console.log(finalRes);
|
||||
if (typeof finalRes === 'string') {
|
||||
finalRes = finalRes.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(logInputMessages), finalRes);
|
||||
return finalRes;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,30 +1,26 @@
|
|||
import { getKey } from '../utils/keys.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class Hyperbolic {
|
||||
constructor(modelName, apiUrl) {
|
||||
this.modelName = modelName || "deepseek-ai/DeepSeek-V3";
|
||||
this.apiUrl = apiUrl || "https://api.hyperbolic.xyz/v1/chat/completions";
|
||||
|
||||
// Retrieve the Hyperbolic API key from keys.js
|
||||
this.apiKey = getKey('HYPERBOLIC_API_KEY');
|
||||
if (!this.apiKey) {
|
||||
throw new Error('HYPERBOLIC_API_KEY not found. Check your keys.js file.');
|
||||
}
|
||||
|
||||
// Direct image data in sendRequest is not supported by this wrapper.
|
||||
this.supportsRawImageInput = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a chat completion request to the Hyperbolic endpoint.
|
||||
*
|
||||
* @param {Array} turns - An array of message objects, e.g. [{role: 'user', content: 'Hi'}].
|
||||
* @param {string} systemMessage - The system prompt or instruction.
|
||||
* @param {string} stopSeq - A stopping sequence, default '***'.
|
||||
* @returns {Promise<string>} - The model's reply.
|
||||
*/
|
||||
async sendRequest(turns, systemMessage, stopSeq = '***') {
|
||||
// Prepare the messages with a system prompt at the beginning
|
||||
async sendRequest(turns, systemMessage, imageData = null, stopSeq = '***') {
|
||||
if (imageData) {
|
||||
console.warn(`[Hyperbolic] Warning: imageData provided to sendRequest, but this method in hyperbolic.js does not support direct image data embedding for model ${this.modelName}. The image will be ignored.`);
|
||||
}
|
||||
const messages = [{ role: 'system', content: systemMessage }, ...turns];
|
||||
|
||||
// Build the request payload
|
||||
const payload = {
|
||||
model: this.modelName,
|
||||
messages: messages,
|
||||
|
@ -32,18 +28,24 @@ export class Hyperbolic {
|
|||
temperature: 0.7,
|
||||
top_p: 0.9,
|
||||
stream: false
|
||||
// stop: stopSeq, // Hyperbolic API might not support stop sequences in the same way or at all.
|
||||
// If it does, it might need to be formatted differently or might not be part of standard payload.
|
||||
// For now, commenting out if it causes issues or is not standard.
|
||||
};
|
||||
if (stopSeq && stopSeq !== '***') { // Only add stop if it's meaningful and not the default placeholder
|
||||
payload.stop = stopSeq;
|
||||
}
|
||||
|
||||
|
||||
const maxAttempts = 5;
|
||||
let attempt = 0;
|
||||
let finalRes = null;
|
||||
let rawCompletionContent = null;
|
||||
|
||||
while (attempt < maxAttempts) {
|
||||
attempt++;
|
||||
console.log(`Awaiting Hyperbolic API response... (attempt: ${attempt})`);
|
||||
console.log('Messages:', messages);
|
||||
|
||||
let completionContent = null;
|
||||
|
||||
try {
|
||||
const response = await fetch(this.apiUrl, {
|
||||
|
@ -56,54 +58,65 @@ export class Hyperbolic {
|
|||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP error! status: ${response.status}`);
|
||||
// Attempt to read error body for more details
|
||||
let errorBody = "No additional error details.";
|
||||
try {
|
||||
errorBody = await response.text();
|
||||
} catch (e) { /* ignore if error body can't be read */ }
|
||||
throw new Error(`HTTP error! status: ${response.status}, message: ${errorBody}`);
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP error! status: ${response.status}`);
|
||||
}
|
||||
const data = await response.json();
|
||||
if (data?.choices?.[0]?.finish_reason === 'length') {
|
||||
throw new Error('Context length exceeded');
|
||||
}
|
||||
|
||||
completionContent = data?.choices?.[0]?.message?.content || '';
|
||||
rawCompletionContent = data?.choices?.[0]?.message?.content || '';
|
||||
console.log('Received response from Hyperbolic.');
|
||||
} catch (err) {
|
||||
if (
|
||||
(err.message === 'Context length exceeded' || err.code === 'context_length_exceeded') &&
|
||||
turns.length > 1
|
||||
) {
|
||||
if ((err.message === 'Context length exceeded' || err.code === 'context_length_exceeded') && turns.length > 1) {
|
||||
console.log('Context length exceeded, trying again with a shorter context...');
|
||||
return await this.sendRequest(turns.slice(1), systemMessage, stopSeq);
|
||||
return await this.sendRequest(turns.slice(1), systemMessage, imageData, stopSeq);
|
||||
} else {
|
||||
console.error(err);
|
||||
completionContent = 'My brain disconnected, try again.';
|
||||
rawCompletionContent = 'My brain disconnected, try again.';
|
||||
finalRes = rawCompletionContent;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Check for <think> blocks
|
||||
const hasOpenTag = completionContent.includes("<think>");
|
||||
const hasCloseTag = completionContent.includes("</think>");
|
||||
let processedContent = rawCompletionContent;
|
||||
const hasOpenTag = processedContent.includes("<think>");
|
||||
const hasCloseTag = processedContent.includes("</think>");
|
||||
|
||||
if ((hasOpenTag && !hasCloseTag)) {
|
||||
console.warn("Partial <think> block detected. Re-generating...");
|
||||
continue; // Retry the request
|
||||
if (attempt < maxAttempts) continue;
|
||||
}
|
||||
|
||||
if (hasCloseTag && !hasOpenTag) {
|
||||
completionContent = '<think>' + completionContent;
|
||||
processedContent = '<think>' + processedContent;
|
||||
}
|
||||
|
||||
if (hasOpenTag && hasCloseTag) {
|
||||
completionContent = completionContent.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
||||
processedContent = processedContent.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
||||
}
|
||||
finalRes = processedContent.replace(/<\|separator\|>/g, '*no response*');
|
||||
if (!(hasOpenTag && !hasCloseTag && attempt < maxAttempts)) {
|
||||
break;
|
||||
}
|
||||
|
||||
finalRes = completionContent.replace(/<\|separator\|>/g, '*no response*');
|
||||
break; // Valid response obtained—exit loop
|
||||
}
|
||||
|
||||
if (finalRes == null) {
|
||||
console.warn("Could not get a valid <think> block or normal response after max attempts.");
|
||||
finalRes = 'I thought too hard, sorry, try again.';
|
||||
finalRes = rawCompletionContent || 'I thought too hard, sorry, try again.';
|
||||
finalRes = finalRes.replace(/<\|separator\|>/g, '*no response*');
|
||||
}
|
||||
|
||||
if (typeof finalRes === 'string') {
|
||||
finalRes = finalRes.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), finalRes);
|
||||
return finalRes;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import { strictFormat } from '../utils/text.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class Local {
|
||||
constructor(model_name, url, params) {
|
||||
|
@ -7,14 +8,37 @@ export class Local {
|
|||
this.url = url || 'http://127.0.0.1:11434';
|
||||
this.chat_endpoint = '/api/chat';
|
||||
this.embedding_endpoint = '/api/embeddings';
|
||||
// Note: Actual multimodal support depends on the specific Ollama model (e.g., LLaVA, BakLLaVA)
|
||||
this.supportsRawImageInput = true;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage) {
|
||||
let model = this.model_name || 'llama3.1'; // Updated to llama3.1, as it is more performant than llama3
|
||||
async sendRequest(turns, systemMessage, imageData = null) {
|
||||
let model = this.model_name || 'sweaterdog/andy-4:latest'; // Changed to Andy-4
|
||||
let messages = strictFormat(turns);
|
||||
messages.unshift({ role: 'system', content: systemMessage });
|
||||
|
||||
// We'll attempt up to 5 times for models with deepseek-r1-esk reasoning if the <think> tags are mismatched.
|
||||
if (imageData) {
|
||||
console.warn(`[Ollama] imageData provided. Ensure the configured Ollama model ('${model}') is multimodal (e.g., llava, bakllava) to process images.`);
|
||||
let lastUserMessageIndex = -1;
|
||||
for (let i = messages.length - 1; i >= 0; i--) {
|
||||
if (messages[i].role === 'user') {
|
||||
lastUserMessageIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (lastUserMessageIndex !== -1) {
|
||||
if (!messages[lastUserMessageIndex].images) {
|
||||
messages[lastUserMessageIndex].images = [];
|
||||
}
|
||||
messages[lastUserMessageIndex].images.push(imageData.toString('base64'));
|
||||
} else {
|
||||
console.warn('[Ollama] imageData provided, but no user message found to attach it to. Image not sent.');
|
||||
// Or, could create a new user message:
|
||||
// messages.push({ role: 'user', content: "Image attached.", images: [imageData.toString('base64')] });
|
||||
}
|
||||
}
|
||||
|
||||
const maxAttempts = 5;
|
||||
let attempt = 0;
|
||||
let finalRes = null;
|
||||
|
@ -24,14 +48,14 @@ export class Local {
|
|||
console.log(`Awaiting local response... (model: ${model}, attempt: ${attempt})`);
|
||||
let res = null;
|
||||
try {
|
||||
res = await this.send(this.chat_endpoint, {
|
||||
let apiResponse = await this.send(this.chat_endpoint, {
|
||||
model: model,
|
||||
messages: messages,
|
||||
stream: false,
|
||||
...(this.params || {})
|
||||
});
|
||||
if (res) {
|
||||
res = res['message']['content'];
|
||||
if (apiResponse) {
|
||||
res = apiResponse['message']['content'];
|
||||
} else {
|
||||
res = 'No response data.';
|
||||
}
|
||||
|
@ -43,38 +67,33 @@ export class Local {
|
|||
console.log(err);
|
||||
res = 'My brain disconnected, try again.';
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// If the model name includes "deepseek-r1" or "Andy-3.5-reasoning", then handle the <think> block.
|
||||
const hasOpenTag = res.includes("<think>");
|
||||
const hasCloseTag = res.includes("</think>");
|
||||
|
||||
// If there's a partial mismatch, retry to get a complete response.
|
||||
if ((hasOpenTag && !hasCloseTag)) {
|
||||
console.warn("Partial <think> block detected. Re-generating...");
|
||||
continue;
|
||||
if (attempt < maxAttempts) continue;
|
||||
}
|
||||
|
||||
// If </think> is present but <think> is not, prepend <think>
|
||||
if (hasCloseTag && !hasOpenTag) {
|
||||
res = '<think>' + res;
|
||||
}
|
||||
// Changed this so if the model reasons, using <think> and </think> but doesn't start the message with <think>, <think> ges prepended to the message so no error occur.
|
||||
|
||||
// If both tags appear, remove them (and everything inside).
|
||||
if (hasOpenTag && hasCloseTag) {
|
||||
res = res.replace(/<think>[\s\S]*?<\/think>/g, '');
|
||||
res = res.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
||||
}
|
||||
|
||||
finalRes = res;
|
||||
break; // Exit the loop if we got a valid response.
|
||||
break;
|
||||
}
|
||||
|
||||
if (finalRes == null) {
|
||||
console.warn("Could not get a valid <think> block or normal response after max attempts.");
|
||||
console.warn("Could not get a valid response after max attempts.");
|
||||
finalRes = 'I thought too hard, sorry, try again.';
|
||||
}
|
||||
if (typeof finalRes === 'string') {
|
||||
finalRes = finalRes.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), finalRes);
|
||||
return finalRes;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,19 +1,17 @@
|
|||
import { Mistral as MistralClient } from '@mistralai/mistralai';
|
||||
import { getKey } from '../utils/keys.js';
|
||||
import { strictFormat } from '../utils/text.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class Mistral {
|
||||
#client;
|
||||
|
||||
constructor(model_name, url, params) {
|
||||
this.model_name = model_name;
|
||||
this.params = params;
|
||||
|
||||
if (typeof url === "string") {
|
||||
console.warn("Mistral does not support custom URL's, ignoring!");
|
||||
|
||||
}
|
||||
|
||||
if (!getKey("MISTRAL_API_KEY")) {
|
||||
throw new Error("Mistral API Key missing, make sure to set MISTRAL_API_KEY in settings.json")
|
||||
}
|
||||
|
@ -23,37 +21,31 @@ export class Mistral {
|
|||
apiKey: getKey("MISTRAL_API_KEY")
|
||||
}
|
||||
);
|
||||
this.supportsRawImageInput = false; // Standard chat completions may not support raw images for all models.
|
||||
|
||||
|
||||
// Prevents the following code from running when model not specified
|
||||
if (typeof this.model_name === "undefined") return;
|
||||
|
||||
// get the model name without the "mistral" or "mistralai" prefix
|
||||
// e.g "mistral/mistral-large-latest" -> "mistral-large-latest"
|
||||
if (typeof model_name.split("/")[1] !== "undefined") {
|
||||
this.model_name = model_name.split("/")[1];
|
||||
if (typeof this.model_name === "string" && typeof this.model_name.split("/")[1] !== "undefined") {
|
||||
this.model_name = this.model_name.split("/")[1];
|
||||
}
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage) {
|
||||
async sendRequest(turns, systemMessage, imageData = null) {
|
||||
if (imageData) {
|
||||
console.warn(`[Mistral] Warning: imageData provided to sendRequest, but this method in mistral.js currently does not support direct image data embedding for model ${this.model_name}. The image will be ignored. Use sendVisionRequest for models/endpoints that support vision, or ensure the API/model used by sendRequest can handle images in its standard chat format.`);
|
||||
// imageData is ignored for now.
|
||||
}
|
||||
|
||||
let result;
|
||||
|
||||
try {
|
||||
const model = this.model_name || "mistral-large-latest";
|
||||
|
||||
const messages = [
|
||||
{ role: "system", content: systemMessage }
|
||||
];
|
||||
const messages = [{ role: "system", content: systemMessage }];
|
||||
messages.push(...strictFormat(turns));
|
||||
|
||||
try {
|
||||
console.log('Awaiting mistral api response...')
|
||||
const response = await this.#client.chat.complete({
|
||||
model,
|
||||
messages,
|
||||
...(this.params || {})
|
||||
});
|
||||
|
||||
result = response.choices[0].message.content;
|
||||
} catch (err) {
|
||||
if (err.message.includes("A request containing images has been given to a model which does not have the 'vision' capability.")) {
|
||||
|
@ -63,24 +55,28 @@ export class Mistral {
|
|||
}
|
||||
console.log(err);
|
||||
}
|
||||
|
||||
if (typeof result === 'string') {
|
||||
result = result.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), result);
|
||||
return result;
|
||||
}
|
||||
|
||||
async sendVisionRequest(messages, systemMessage, imageBuffer) {
|
||||
const imageMessages = [...messages];
|
||||
imageMessages.push({
|
||||
role: "user",
|
||||
content: [
|
||||
{ type: "text", text: systemMessage },
|
||||
{
|
||||
async sendVisionRequest(original_turns, systemMessage, imageBuffer) {
|
||||
const imageFormattedTurns = [...original_turns];
|
||||
const userMessageContent = [{ type: "text", text: systemMessage }];
|
||||
userMessageContent.push({
|
||||
type: "image_url",
|
||||
imageUrl: `data:image/jpeg;base64,${imageBuffer.toString('base64')}`
|
||||
}
|
||||
]
|
||||
});
|
||||
imageFormattedTurns.push({ role: "user", content: userMessageContent });
|
||||
|
||||
return this.sendRequest(imageMessages, systemMessage);
|
||||
const res = await this.sendRequest(imageFormattedTurns, systemMessage);
|
||||
|
||||
if (imageBuffer && res) {
|
||||
logVision(original_turns, imageBuffer, res, systemMessage);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
async embed(text) {
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import OpenAIApi from 'openai';
|
||||
import { getKey } from '../utils/keys.js';
|
||||
import { strictFormat } from '../utils/text.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
// llama, mistral
|
||||
export class Novita {
|
||||
|
@ -16,9 +17,14 @@ export class Novita {
|
|||
config.apiKey = getKey('NOVITA_API_KEY');
|
||||
|
||||
this.openai = new OpenAIApi(config);
|
||||
// Direct image data in sendRequest is not supported by this wrapper.
|
||||
this.supportsRawImageInput = false;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage, stop_seq='***') {
|
||||
async sendRequest(turns, systemMessage, imageData = null, stop_seq='***') {
|
||||
if (imageData) {
|
||||
console.warn(`[Novita] Warning: imageData provided to sendRequest, but this method in novita.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored.`);
|
||||
}
|
||||
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
|
||||
|
||||
|
||||
|
@ -43,20 +49,29 @@ export class Novita {
|
|||
catch (err) {
|
||||
if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
|
||||
console.log('Context length exceeded, trying again with shorter context.');
|
||||
return await sendRequest(turns.slice(1), systemMessage, stop_seq);
|
||||
return await this.sendRequest(turns.slice(1), systemMessage, imageData, stop_seq); // Added this. and imageData
|
||||
} else {
|
||||
console.log(err);
|
||||
res = 'My brain disconnected, try again.';
|
||||
}
|
||||
}
|
||||
if (res.includes('<think>')) {
|
||||
if (typeof res === 'string') {
|
||||
res = res.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), res); // Log transformed res
|
||||
|
||||
// Existing stripping logic for <think> tags
|
||||
if (res && typeof res === 'string' && res.includes('<think>')) {
|
||||
let start = res.indexOf('<think>');
|
||||
let end = res.indexOf('</think>') + 8;
|
||||
if (start != -1) {
|
||||
if (end != -1) {
|
||||
let end = res.indexOf('</think>') + 8; // length of '</think>'
|
||||
if (start !== -1) { // Ensure '<think>' was found
|
||||
if (end !== -1 && end > start + 7) { // Ensure '</think>' was found and is after '<think>'
|
||||
res = res.substring(0, start) + res.substring(end);
|
||||
} else {
|
||||
res = res.substring(0, start+7);
|
||||
// Malformed or missing end tag, strip from '<think>' onwards or handle as error
|
||||
// Original code: res = res.substring(0, start+7); This would leave "<think>"
|
||||
// Let's assume we strip from start if end is not valid.
|
||||
res = res.substring(0, start);
|
||||
}
|
||||
}
|
||||
res = res.trim();
|
||||
|
|
|
@ -1,55 +1,104 @@
|
|||
import OpenAIApi from 'openai';
|
||||
import { getKey, hasKey } from '../utils/keys.js';
|
||||
import { strictFormat } from '../utils/text.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class OpenRouter {
|
||||
constructor(model_name, url) {
|
||||
this.model_name = model_name;
|
||||
|
||||
let config = {};
|
||||
config.baseURL = url || 'https://openrouter.ai/api/v1';
|
||||
|
||||
const apiKey = getKey('OPENROUTER_API_KEY');
|
||||
if (!apiKey) {
|
||||
console.error('Error: OPENROUTER_API_KEY not found. Make sure it is set properly.');
|
||||
}
|
||||
|
||||
// Pass the API key to OpenAI compatible Api
|
||||
config.apiKey = apiKey;
|
||||
|
||||
this.openai = new OpenAIApi(config);
|
||||
// OpenRouter is a router; individual models might support vision.
|
||||
// This generic sendRequest does not format for vision. Use sendVisionRequest or specific model logic.
|
||||
this.supportsRawImageInput = false;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage, stop_seq='*') {
|
||||
async sendRequest(turns, systemMessage, imageData = null, stop_seq='*') {
|
||||
if (imageData) {
|
||||
console.warn(`[OpenRouter] Warning: imageData provided to sendRequest. While OpenRouter can route to vision models, this generic method does not format for image data. The image will be ignored. Use sendVisionRequest or ensure your model call through OpenRouter is specifically formatted for vision if needed.`);
|
||||
}
|
||||
let messages = [{ role: 'system', content: systemMessage }, ...turns];
|
||||
messages = strictFormat(messages);
|
||||
|
||||
// Choose a valid model from openrouter.ai (for example, "openai/gpt-4o")
|
||||
const pack = {
|
||||
model: this.model_name,
|
||||
messages,
|
||||
stop: stop_seq
|
||||
include_reasoning: true,
|
||||
// stop: stop_seq // Commented out since some API providers on Openrouter do not support a stop sequence, such as Grok 3
|
||||
};
|
||||
|
||||
const maxAttempts = 5;
|
||||
let attempt = 0;
|
||||
let finalRes = null;
|
||||
|
||||
while (attempt < maxAttempts) {
|
||||
attempt++;
|
||||
console.info(`Awaiting openrouter API response... (attempt: ${attempt})`);
|
||||
let res = null;
|
||||
try {
|
||||
console.log('Awaiting openrouter api response...');
|
||||
let completion = await this.openai.chat.completions.create(pack);
|
||||
if (!completion?.choices?.[0]) {
|
||||
console.error('No completion or choices returned:', completion);
|
||||
return 'No response received.';
|
||||
}
|
||||
|
||||
const logMessages = [{ role: "system", content: processedSystemMessage }].concat(turns);
|
||||
|
||||
if (completion.choices[0].finish_reason === 'length') {
|
||||
throw new Error('Context length exceeded');
|
||||
}
|
||||
console.log('Received.');
|
||||
|
||||
if (completion.choices[0].message.reasoning) {
|
||||
try{
|
||||
const reasoning = '<think>\n' + completion.choices[0].message.reasoning + '</think>\n';
|
||||
const content = completion.choices[0].message.content;
|
||||
|
||||
// --- VISION LOGGING ---
|
||||
if (visionImageBuffer) {
|
||||
logVision(turns, visionImageBuffer, reasoning + "\n" + content, visionMessage);
|
||||
} else {
|
||||
log(JSON.stringify(logMessages), reasoning + "\n" + content);
|
||||
}
|
||||
res = content;
|
||||
} catch {}
|
||||
} else {
|
||||
try {
|
||||
res = completion.choices[0].message.content;
|
||||
if (visionImageBuffer) {
|
||||
logVision(turns, visionImageBuffer, res, visionMessage);
|
||||
} else {
|
||||
log(JSON.stringify(logMessages), res);
|
||||
}
|
||||
} catch {
|
||||
console.warn("Unable to log due to unknown error!");
|
||||
}
|
||||
}
|
||||
// Trim <think> blocks from the final response if present.
|
||||
if (res && res.includes("<think>") && res.includes("</think>")) {
|
||||
res = res.replace(/<think>[\s\S]*?<\/think>/g, '').trim();
|
||||
}
|
||||
|
||||
console.info('Received.');
|
||||
} catch (err) {
|
||||
console.error('Error while awaiting response:', err);
|
||||
// If the error indicates a context-length problem, we can slice the turns array, etc.
|
||||
res = 'My brain disconnected, try again.';
|
||||
}
|
||||
return res;
|
||||
|
||||
finalRes = res;
|
||||
break; // Exit loop once a valid response is obtained.
|
||||
}
|
||||
|
||||
if (finalRes == null) {
|
||||
console.warn("Could not get a valid <think> block or normal response after max attempts.");
|
||||
finalRes = 'I thought too hard, sorry, try again.';
|
||||
}
|
||||
return finalRes;
|
||||
}
|
||||
|
||||
async sendVisionRequest(messages, systemMessage, imageBuffer) {
|
||||
|
@ -67,7 +116,10 @@ export class OpenRouter {
|
|||
]
|
||||
});
|
||||
|
||||
return this.sendRequest(imageMessages, systemMessage);
|
||||
// sendVisionRequest formats its own message array; sendRequest here should not process new imageData.
|
||||
// Pass systemMessage and stop_seq as originally intended by sendRequest.
|
||||
return this.sendRequest(imageMessages, systemMessage, null, stop_seq);
|
||||
|
||||
}
|
||||
|
||||
async embed(text) {
|
||||
|
|
|
@ -334,9 +334,29 @@ export class Prompter {
|
|||
let prompt = this.profile.conversing;
|
||||
prompt = await this.replaceStrings(prompt, messages, this.convo_examples);
|
||||
let generation;
|
||||
let imageData = null;
|
||||
|
||||
if (settings.vision_mode === 'always' && messages.length > 0) {
|
||||
const lastMessage = messages[messages.length - 1];
|
||||
// Check if the last message has an imagePath and if the model supports raw image input
|
||||
if (lastMessage.imagePath && this.chat_model.supportsRawImageInput) {
|
||||
try {
|
||||
// Construct the full path to the image file
|
||||
const agentScreenshotDir = path.join('bots', this.agent.name, 'screenshots');
|
||||
const imageFullPath = path.join(agentScreenshotDir, lastMessage.imagePath);
|
||||
|
||||
console.log(`[Prompter] Attempting to read image for always_active mode: ${imageFullPath}`);
|
||||
imageData = await fs.readFile(imageFullPath); // Read as buffer
|
||||
console.log('[Prompter] Image data prepared for chat model.');
|
||||
} catch (err) {
|
||||
console.error(`[Prompter] Error reading image file ${lastMessage.imagePath}:`, err);
|
||||
imageData = null; // Proceed without image data if reading fails
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
generation = await this.chat_model.sendRequest(messages, prompt);
|
||||
generation = await this.chat_model.sendRequest(messages, prompt, imageData);
|
||||
if (typeof generation !== 'string') {
|
||||
console.error('Error: Generated response is not a string', generation);
|
||||
throw new Error('Generated response is not a string');
|
||||
|
@ -445,8 +465,26 @@ export class Prompter {
|
|||
}
|
||||
|
||||
async _saveLog(prompt, messages, generation, tag) {
|
||||
if (!settings.log_all_prompts)
|
||||
// NEW LOGIC STARTS
|
||||
switch (tag) {
|
||||
case 'conversation':
|
||||
case 'coding': // Assuming coding logs fall under normal data
|
||||
case 'memSaving':
|
||||
if (!settings.log_normal_data) return;
|
||||
break;
|
||||
// Add case for 'vision' if prompter.js starts logging vision prompts/responses via _saveLog
|
||||
// case 'vision':
|
||||
// if (!settings.log_vision_data) return;
|
||||
// break;
|
||||
default:
|
||||
// If it's an unknown tag, perhaps log it if general logging is on, or ignore.
|
||||
// For safety, let's assume if it's not specified, it doesn't get logged unless a general flag is on.
|
||||
// However, the goal is to use specific flags. So, if a new tag appears, this logic should be updated.
|
||||
// For now, if it doesn't match known tags that map to a setting, it won't log.
|
||||
return;
|
||||
}
|
||||
// NEW LOGIC ENDS
|
||||
|
||||
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
||||
let logEntry;
|
||||
let task_id = this.agent.task.task_id;
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import OpenAIApi from 'openai';
|
||||
import { getKey, hasKey } from '../utils/keys.js';
|
||||
import { strictFormat } from '../utils/text.js';
|
||||
import { log, logVision } from '../../logger.js';
|
||||
|
||||
export class Qwen {
|
||||
constructor(model_name, url, params) {
|
||||
|
@ -12,15 +13,51 @@ export class Qwen {
|
|||
config.apiKey = getKey('QWEN_API_KEY');
|
||||
|
||||
this.openai = new OpenAIApi(config);
|
||||
// Note: Actual multimodal support depends on the specific Qwen model (e.g., qwen-vl-plus)
|
||||
this.supportsRawImageInput = true;
|
||||
}
|
||||
|
||||
async sendRequest(turns, systemMessage, stop_seq='***') {
|
||||
async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
|
||||
let messages = [{'role': 'system', 'content': systemMessage}].concat(turns);
|
||||
|
||||
messages = strictFormat(messages);
|
||||
|
||||
if (imageData) {
|
||||
// Qwen VL models include names like "qwen-vl-plus", "qwen-vl-max", "qwen-vl-chat-v1"
|
||||
if (!this.model_name || !this.model_name.toLowerCase().includes('-vl')) {
|
||||
console.warn(`[Qwen] Warning: imageData provided for model ${this.model_name}, which does not appear to be a Qwen Vision-Language (VL) model. The image may be ignored or cause an error.`);
|
||||
}
|
||||
|
||||
let lastUserMessageIndex = -1;
|
||||
for (let i = messages.length - 1; i >= 0; i--) {
|
||||
if (messages[i].role === 'user') {
|
||||
lastUserMessageIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (lastUserMessageIndex !== -1) {
|
||||
const userMessage = messages[lastUserMessageIndex];
|
||||
if (typeof userMessage.content === 'string') { // Ensure content is a string before converting
|
||||
userMessage.content = [
|
||||
{ "text": userMessage.content },
|
||||
{ "image": `data:image/jpeg;base64,${imageData.toString('base64')}` }
|
||||
];
|
||||
} else if (Array.isArray(userMessage.content)) {
|
||||
// If content is already an array (e.g. from previous image), add new image
|
||||
userMessage.content.push({ "image": `data:image/jpeg;base64,${imageData.toString('base64')}` });
|
||||
} else {
|
||||
console.warn('[Qwen] Last user message content is not a string or array. Creating new content array for image.');
|
||||
userMessage.content = [{ "image": `data:image/jpeg;base64,${imageData.toString('base64')}` }];
|
||||
}
|
||||
} else {
|
||||
console.warn('[Qwen] imageData provided, but no user message found to attach it to. Image not sent.');
|
||||
// Alternative: Create a new user message with the image
|
||||
// messages.push({ role: 'user', content: [{ "image": `data:image/jpeg;base64,${imageData.toString('base64')}` }] });
|
||||
}
|
||||
}
|
||||
|
||||
const pack = {
|
||||
model: this.model_name || "qwen-plus",
|
||||
model: this.model_name || "qwen-plus", // Default might need to be a VL model if images are common
|
||||
messages,
|
||||
stop: stop_seq,
|
||||
...(this.params || {})
|
||||
|
@ -45,6 +82,10 @@ export class Qwen {
|
|||
res = 'My brain disconnected, try again.';
|
||||
}
|
||||
}
|
||||
if (typeof res === 'string') {
|
||||
res = res.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
|
||||
}
|
||||
log(JSON.stringify(messages), res);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
@@ -1,6 +1,7 @@
import Replicate from 'replicate';
import { toSinglePrompt } from '../utils/text.js';
import { getKey } from '../utils/keys.js';
import { log, logVision } from '../../logger.js';

// llama, mistral
export class ReplicateAPI {

@@ -16,13 +17,20 @@ export class ReplicateAPI {
        this.replicate = new Replicate({
            auth: getKey('REPLICATE_API_KEY'),
        });
        // Direct image data in sendRequest is not supported by this wrapper.
        // Replicate handles vision models differently, often with specific inputs like "image".
        this.supportsRawImageInput = false;
    }

    async sendRequest(turns, systemMessage) {
    async sendRequest(turns, systemMessage, imageData = null) {
        if (imageData) {
            console.warn(`[ReplicateAPI] Warning: imageData provided to sendRequest, but this method in replicate.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored. Replicate models with vision capabilities usually require specific input fields like 'image' with a URL or base64 string.`);
        }
        const stop_seq = '***';
        const prompt = toSinglePrompt(turns, null, stop_seq);
        let model_name = this.model_name || 'meta/meta-llama-3-70b-instruct';

        const logInputMessages = [{ role: 'system', content: systemMessage }, ...turns];
        const input = {
            prompt,
            system_prompt: systemMessage,

@@ -45,6 +53,10 @@ export class ReplicateAPI {
            console.log(err);
            res = 'My brain disconnected, try again.';
        }
        if (typeof res === 'string') {
            res = res.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
        }
        log(JSON.stringify(logInputMessages), res);
        console.log('Received.');
        return res;
    }
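As the warning above notes, vision-capable Replicate models usually take the image as a dedicated input field rather than embedded in the prompt. A rough sketch of what such a call could look like; the model slug, version, and input field names are assumptions that vary per model and are not part of this commit:

```js
import fs from 'fs';
import Replicate from 'replicate';

const replicate = new Replicate({ auth: process.env.REPLICATE_API_KEY });
const imageBuffer = fs.readFileSync('screenshot.jpg'); // placeholder image file

// Hypothetical call; consult the chosen model's schema for its actual input names.
const output = await replicate.run('owner/some-vision-model:version', {
    input: {
        prompt: 'Describe what the bot sees.',
        image: `data:image/jpeg;base64,${imageBuffer.toString('base64')}` // or a public URL
    }
});
console.log(output);
```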
@@ -1,9 +1,13 @@
// This code uses Dashscope and HTTP to ensure the latest support for the Qwen model.
// Qwen is also compatible with the OpenAI API format;

// This code uses Dashscope and HTTP to ensure the latest support for the Qwen model.
// Qwen is also compatible with the OpenAI API format;

import OpenAIApi from 'openai';
import { getKey, hasKey } from '../utils/keys.js';
import { strictFormat } from '../utils/text.js';
import { log, logVision } from '../../logger.js';

export class VLLM {
    constructor(model_name, url) {

@@ -19,9 +23,15 @@ export class VLLM {
        vllm_config.apiKey = ""

        this.vllm = new OpenAIApi(vllm_config);
        // VLLM can serve various models. This generic sendRequest does not format for vision.
        // Specific multimodal models served via VLLM might require custom request formatting.
        this.supportsRawImageInput = false;
    }

    async sendRequest(turns, systemMessage, stop_seq = '***') {
    async sendRequest(turns, systemMessage, imageData = null, stop_seq = '***') {
        if (imageData) {
            console.warn(`[VLLM] Warning: imageData provided to sendRequest, but this method in vllm.js does not support direct image data embedding for model ${this.model_name}. The image will be ignored. Ensure the VLLM endpoint is configured for a multimodal model and the request is formatted accordingly if vision is intended.`);
        }
        let messages = [{ 'role': 'system', 'content': systemMessage }].concat(turns);

        if (this.model_name.includes('deepseek') || this.model_name.includes('qwen')) {

@@ -47,12 +57,16 @@ export class VLLM {
        catch (err) {
            if ((err.message == 'Context length exceeded' || err.code == 'context_length_exceeded') && turns.length > 1) {
                console.log('Context length exceeded, trying again with shorter context.');
                return await this.sendRequest(turns.slice(1), systemMessage, stop_seq);
                return await this.sendRequest(turns.slice(1), systemMessage, imageData, stop_seq);
            } else {
                console.log(err);
                res = 'My brain disconnected, try again.';
            }
        }
        if (typeof res === 'string') {
            res = res.replace(/<thinking>/g, '<think>').replace(/<\/thinking>/g, '</think>');
        }
        log(JSON.stringify(messages), res);
        return res;
    }
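If the VLLM endpoint were serving a multimodal model, the request would typically need OpenAI-style content parts rather than the plain string messages built above. A sketch under that assumption; the model name, endpoint URL, and image file are placeholders, and this formatting is not implemented by the wrapper in this commit:

```js
import fs from 'fs';
import OpenAIApi from 'openai';

const client = new OpenAIApi({ baseURL: 'http://localhost:8000/v1', apiKey: '' });
const imageBuffer = fs.readFileSync('screenshot.jpg'); // placeholder image file

const completion = await client.chat.completions.create({
    model: 'llava-hf/llava-1.5-7b-hf', // assumed multimodal model served by vLLM
    messages: [
        { role: 'system', content: 'You are a Minecraft assistant.' },
        {
            role: 'user',
            content: [
                { type: 'text', text: 'Describe the scene.' },
                { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${imageBuffer.toString('base64')}` } }
            ]
        }
    ]
});
console.log(completion.choices[0].message.content);
```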
247
src/process/tts_process.js
Normal file
247
src/process/tts_process.js
Normal file
@@ -0,0 +1,247 @@
import settings from '../../settings.js';
import { GroqCloudTTS } from '../models/groq.js';
import portAudio from 'naudiodon';
const { AudioIO, SampleFormat16Bit } = portAudio;
import wav from 'wav';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

// Import getIO and our new function getAllInGameAgentNames
import { getIO, getAllInGameAgentNames } from '../server/mind_server.js';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Delete leftover speech_*.wav from previous runs
 */
const leftover = fs.readdirSync(__dirname).filter(f => /^speech_\d+\.wav$/.test(f));
for (const file of leftover) {
    try {
        fs.unlinkSync(path.join(__dirname, file));
    } catch (_) {
        // ignore errors
    }
}

// Configuration
const RMS_THRESHOLD = 500;      // Lower threshold for faint audio
const SILENCE_DURATION = 2000;  // 2 seconds of silence after speech => stop
const SAMPLE_RATE = 16000;
const BIT_DEPTH = 16;
const STT_USERNAME = settings.stt_username || "SERVER"; // Name that appears as sender
const STT_AGENT_NAME = settings.stt_agent_name || "";   // If blank, broadcast to all

// Guards to prevent multiple overlapping recordings
let isRecording = false; // Ensures only one recordAndTranscribeOnce at a time
let sttRunning = false;  // Ensures continuousLoop is started only once

/**
 * Records one session, transcribes, and sends to MindServer as a chat message
 */
async function recordAndTranscribeOnce() {
    // If another recording is in progress, just skip
    if (isRecording) {
        console.log("Another recording is still in progress; skipping new record attempt.");
        return null;
    }
    isRecording = true;

    const outFile = path.join(__dirname, `speech_${Date.now()}.wav`);
    const fileWriter = new wav.FileWriter(outFile, {
        channels: 1,
        sampleRate: SAMPLE_RATE,
        bitDepth: BIT_DEPTH
    });
    const ai = new AudioIO({
        inOptions: {
            channelCount: 1,
            sampleFormat: SampleFormat16Bit,
            sampleRate: SAMPLE_RATE,
            deviceId: -1,
            closeOnError: true
        }
    });

    let recording = true;
    let hasHeardSpeech = false;
    let silenceTimer = null;
    let finished = false; // Guard to ensure final processing is done only once

    // Helper to reset silence timer
    function resetSilenceTimer() {
        if (silenceTimer) clearTimeout(silenceTimer);
        if (hasHeardSpeech) {
            silenceTimer = setTimeout(() => stopRecording(), SILENCE_DURATION);
        }
    }

    // Stop recording
    function stopRecording() {
        if (!recording) return;
        recording = false;
        ai.quit();
        fileWriter.end();
    }

    // We wrap everything in a promise so we can await the transcription
    return new Promise((resolve, reject) => {
        // Attach event handlers
        ai.on('data', (chunk) => {
            fileWriter.write(chunk);

            // Calculate RMS for threshold detection
            let sumSquares = 0;
            const sampleCount = chunk.length / 2;
            for (let i = 0; i < chunk.length; i += 2) {
                const sample = chunk.readInt16LE(i);
                sumSquares += sample * sample;
            }
            const rms = Math.sqrt(sumSquares / sampleCount);

            // If RMS passes threshold, we've heard speech
            if (rms > RMS_THRESHOLD) {
                if (!hasHeardSpeech) {
                    hasHeardSpeech = true;
                }
                resetSilenceTimer();
            }
        });

        ai.on('error', (err) => {
            cleanupListeners();
            reject(err);
        });

        fileWriter.on('finish', async () => {
            if (finished) return;
            finished = true;
            try {
                // Check audio duration
                const stats = fs.statSync(outFile);
                const headerSize = 44; // standard WAV header size
                const dataSize = stats.size - headerSize;
                const duration = dataSize / (SAMPLE_RATE * (BIT_DEPTH / 8));
                if (duration < 2.75) {
                    console.log("Audio too short (<2.75s); discarding.");
                    fs.unlink(outFile, () => {});
                    cleanupListeners();
                    return resolve(null);
                }

                // Transcribe
                const groqTTS = new GroqCloudTTS();
                const text = await groqTTS.transcribe(outFile, {
                    model: "distil-whisper-large-v3-en",
                    prompt: "",
                    response_format: "json",
                    language: "en",
                    temperature: 0.0
                });

                fs.unlink(outFile, () => {}); // cleanup WAV file

                // Basic check for empty or whitespace
                if (!text || !text.trim()) {
                    console.log("Transcription empty; discarding.");
                    cleanupListeners();
                    return resolve(null);
                }

                // Heuristic checks to determine if the transcription is genuine

                // 1. Ensure at least one alphabetical character
                if (!/[A-Za-z]/.test(text)) {
                    console.log("Transcription has no letters; discarding.");
                    cleanupListeners();
                    return resolve(null);
                }

                // 2. Check for gibberish repeated sequences
                if (/([A-Za-z])\1{3,}/.test(text)) {
                    console.log("Transcription looks like gibberish; discarding.");
                    cleanupListeners();
                    return resolve(null);
                }

                // 3. Check transcription length, with allowed greetings
                const letterCount = text.replace(/[^A-Za-z]/g, "").length;
                const normalizedText = text.trim().toLowerCase();
                const allowedGreetings = new Set(["hi", "hello", "greetings", "hey"]);

                if (letterCount < 8 && !allowedGreetings.has(normalizedText)) {
                    console.log("Transcription too short and not an allowed greeting; discarding.");
                    cleanupListeners();
                    return resolve(null);
                }

                console.log("Transcription:", text);

                // Format message so it looks like: "[SERVER] message"
                const finalMessage = `[${STT_USERNAME}] ${text}`;

                // If STT_AGENT_NAME is empty, broadcast to all agents
                if (!STT_AGENT_NAME.trim()) {
                    const agentNames = getAllInGameAgentNames(); // from mind_server
                    for (const agentName of agentNames) {
                        getIO().emit('send-message', agentName, finalMessage);
                    }
                } else {
                    // Otherwise, send only to the specified agent
                    getIO().emit('send-message', STT_AGENT_NAME, finalMessage);
                }

                cleanupListeners();
                resolve(text);
            } catch (err) {
                cleanupListeners();
                reject(err);
            }
        });

        ai.start();

        function cleanupListeners() {
            ai.removeAllListeners('data');
            ai.removeAllListeners('error');
            fileWriter.removeAllListeners('finish');
            if (silenceTimer) clearTimeout(silenceTimer);

            // release lock
            isRecording = false;
        }
    });
}

/**
 * Runs recording sessions sequentially, so only one at a time
 */
async function continuousLoop() {
    while (true) {
        try {
            await recordAndTranscribeOnce();
        } catch (err) {
            console.error("[STT Error]", err);
        }
        // short gap
        await new Promise(res => setTimeout(res, 1000));
    }
}

export function initTTS() {
    // Only run if stt_transcription is true and we haven't started already
    if (!settings.stt_transcription) return;

    if (sttRunning) {
        console.log("STT loop already running; skipping re-init.");
        return;
    }
    sttRunning = true;

    continuousLoop().catch((err) => {
        console.error("[STT] continuousLoop crashed", err);
    });
}

initTTS();
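A quick sanity check of the constants this recorder relies on, using only values from the file above: at 16 kHz, 16-bit mono the WAV data rate is 32,000 bytes per second, so the 2.75 s minimum corresponds to roughly 88 kB of audio data after the 44-byte header, and the RMS gate of 500 is about 1.5% of 16-bit full scale (32,767).

```js
// Back-of-envelope check of the recorder's constants (values taken from tts_process.js above).
const SAMPLE_RATE = 16000;  // Hz
const BIT_DEPTH = 16;       // bits per sample
const bytesPerSecond = SAMPLE_RATE * (BIT_DEPTH / 8);  // 32000 bytes/s for mono audio
const minDataBytes = 2.75 * bytesPerSecond;             // ≈ 88000 bytes needed to pass the 2.75 s gate
console.log(bytesPerSecond, Math.round(minDataBytes));  // 32000 88000
```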
@@ -161,3 +161,6 @@ function stopAllAgents() {
export const getIO = () => io;
export const getServer = () => server;
export const getConnectedAgents = () => connectedAgents;
export function getAllInGameAgentNames() {
    return Object.keys(inGameAgents);
}
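For completeness: the STT loop above only emits 'send-message' over Socket.IO; how that message reaches a bot depends on MindServer/agent wiring outside this diff. A purely hypothetical receiver sketch to illustrate the event shape (the URL, agent name, and handling below are assumptions, not code from this commit):

```js
import { io } from 'socket.io-client';

// Hypothetical agent-side listener; the event name and payload mirror the
// getIO().emit('send-message', agentName, finalMessage) call in tts_process.js.
const socket = io('http://localhost:8080'); // assumed MindServer address
const myAgentName = 'andy';                 // assumed agent name

socket.on('send-message', (agentName, message) => {
    if (agentName !== myAgentName) return;  // only handle messages addressed to this agent
    console.log(`STT message for ${agentName}:`, message); // e.g. "[SERVER] follow me"
});
```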