Add files via upload

2025-07-30 11:55:29 +02:00 · 2025-06-07 18:06:52 -07:00 · 2025-06-07 18:06:52 -07:00 · 29b22349ec
commit 29b22349ec
parent 0902733047
3 changed files with 2099 additions and 0 deletions
--- a/logs/convert.py
+++ b/logs/convert.py
@ -0,0 +1,965 @@
+import csv
+import json
+import logging
+import sys
+import os
+import random
+from typing import List, Dict
+import pandas as pd
+from USERNAMES import Get_Usernames
+from transformers import AutoTokenizer
+from tqdm import tqdm
+import torch
+from PIL import Image
+import base64
+from io import BytesIO
+
+# Try to import pandas-image-methods for vision data handling
+try:
+    from pandas_image_methods import PILMethods
+    PANDAS_IMAGE_METHODS_AVAILABLE = True
+    # Enable PIL methods for pandas
+    pd.api.extensions.register_series_accessor("pil")(PILMethods)
+except ImportError:
+    PANDAS_IMAGE_METHODS_AVAILABLE = False
+    logging.warning("pandas-image-methods not available. Install with: pip install pandas-image-methods")
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Increase CSV field size limit to avoid errors with very large fields.
+maxInt = sys.maxsize
+while True:
+    try:
+        csv.field_size_limit(maxInt)
+        break
+    except OverflowError:
+        maxInt = int(maxInt/10)
+
+# Define the original usernames.
+ORIGINAL_USERNAMES = [
+    "SweaterDog_YT", "SweaterDog", "Sweaterdog", "Foolish_Pear69", "Farquadthegod72", "Hank",
+    "Gordan", "Perry", "Frederick", "Oliver", "Bill", "Ashley", "Greg", "Treb", "Mia", "Tia", "ALBeRT", "Jason"
+]
+
+# Define outputs that should cause the conversation to be deleted.
+BAD_OUTPUTS = {
+    "My brain just kinda stopped working. Try again.",
+    "My brain disconnected, try again.",
+    "Vision is only supported",
+    "Context length exceeded",
+    "Image input modality is not enabled",
+    "An unexpected error occurred",
+}
+
+MINECRAFT_USERNAMES = list(set(Get_Usernames()))  # Remove duplicates
+duplicate_count = len(Get_Usernames()) - len(MINECRAFT_USERNAMES)
+
+available_minecraft_usernames = list(MINECRAFT_USERNAMES)  # Create a copy for tracking
+
+global username_replaced_count
+global reasoning_replaced_count
+username_replaced_count = 0
+reasoning_replaced_count = 0
+
+def replace_reasoning_prompt(text: str) -> str:
+    global reasoning_replaced_count
+    replaced = False
+    # Optionally, replace the reasoning prompt if needed.
+    if replaced:
+        reasoning_replaced_count += 1
+    return text
+
+def parse_json_safely(text: str) -> List[Dict[str, str]]:
+    try:
+        if text.startswith('[') and '],' in text:
+            parts = text.split('],')
+            text = parts[0] + ']'
+        if text.startswith('"') and text.endswith('"'):
+            text = text[1:-1]
+            text = text.replace('""', '"')
+        data = json.loads(text)
+        if isinstance(data, list) and len(data) > 0 and isinstance(data[0], list):
+            data = data[0]
+        converted_messages = []
+        for msg in data:
+            if isinstance(msg, dict) and 'role' in msg and 'content' in msg:
+                converted_messages.append({
+                    "from": "human" if msg['role'] in ("system", "user") else "gpt",
+                    "value": msg['content']
+                })
+        return converted_messages
+    except Exception as e:
+        logger.debug(f"Error parsing JSON: {e}")  # Suppressed error level
+        return [{
+            "from": "human",
+            "value": text
+        }]
+
+def create_conversation_thread(row: Dict[str, str]) -> List[Dict[str, str]]:
+    messages = []
+    conversation_replacements = {}  # Track username replacements for this conversation ONLY
+    
+    def replace_usernames_in_message(text: str) -> str:
+        global username_replaced_count
+        global available_minecraft_usernames
+        replaced = False
+
+        if not MINECRAFT_USERNAMES:
+            return text
+
+        for orig_name in ORIGINAL_USERNAMES:
+            if orig_name in text:
+                if orig_name not in conversation_replacements:
+                    # If we've used all available names, reset the list
+                    if not available_minecraft_usernames:
+                        available_minecraft_usernames = list(MINECRAFT_USERNAMES)
+                    # Get a random name from the available ones
+                    replacement = random.choice(available_minecraft_usernames)
+                    available_minecraft_usernames.remove(replacement)
+                    conversation_replacements[orig_name] = replacement
+                    replaced = True
+                # Use existing replacement for this conversation
+                text = text.replace(orig_name, conversation_replacements[orig_name])
+
+        if replaced:
+            username_replaced_count += 1
+        return text
+
+    if row.get("input"):
+        messages = parse_json_safely(str(row["input"]))
+        # Apply consistent username replacements to all messages
+        for msg in messages:
+            msg["value"] = replace_usernames_in_message(msg["value"])
+    
+    if row.get("output"):
+        output_text = str(row["output"]).strip()
+        output_text = replace_usernames_in_message(output_text)
+        output_text = replace_reasoning_prompt(output_text)
+        messages.append({
+            "from": "gpt",
+            "value": output_text
+        })
+    
+    return messages
+
+def conversation_has_bad_output(messages: List[Dict[str, str]]) -> bool:
+    for msg in messages:
+        if msg["from"] == "gpt" and msg["value"].strip() in BAD_OUTPUTS:
+            return True
+    return False
+
+def load_image_from_base64(base64_string: str):
+    """Convert base64 string to PIL Image"""
+    try:
+        if base64_string.startswith('data:'):
+            base64_string = base64_string.split(',')[1]
+        
+        image_bytes = base64.b64decode(base64_string)
+        image = Image.open(BytesIO(image_bytes))
+        
+        if image.mode in ('RGBA', 'LA', 'P'):
+            image = image.convert('RGB')
+            
+        return image
+    except Exception as e:
+        logger.debug(f"Error loading image from base64: {e}")
+        return Image.new('RGB', (224, 224), color='gray')
+
+def pil_image_to_parquet_dict(image: Image.Image, filename: str) -> Dict:
+    """Converts a PIL Image to the dictionary format {bytes, path} for Parquet."""
+    img_byte_arr = BytesIO()
+    # Determine a suitable save format
+    save_format = image.format if image.format and image.format in Image.SAVE else 'PNG'
+    
+    # Handle specific mode conversions if necessary for the chosen format
+    if save_format == 'PNG' and image.mode not in ['RGB', 'RGBA', 'L', 'P', 'I', 'F']: # Common PNG modes
+        # Convert to a mode PNG supports, e.g., RGBA to preserve transparency
+        image_to_save = image.convert("RGBA")
+    elif save_format == 'JPEG' and image.mode not in ['RGB', 'L', 'CMYK']:
+        # Convert to a mode JPEG supports
+        image_to_save = image.convert("RGB")
+    else:
+        image_to_save = image
+
+    try:
+        image_to_save.save(img_byte_arr, format=save_format)
+    except Exception as e:
+        logger.warning(f"Could not save image {filename} in format {save_format} (Error: {e}). Attempting PNG.")
+        save_format = 'PNG'
+        if image_to_save.mode not in ['RGB', 'RGBA', 'L', 'P', 'I', 'F']:
+            image_to_save = image.convert("RGBA") # Default to RGBA for PNG
+        image_to_save.save(img_byte_arr, format=save_format)
+        
+    return {"bytes": img_byte_arr.getvalue(), "path": filename}
+
+def extract_vision_data_from_jsonl(jsonl_path: str) -> List[Dict]:
+    """Extract vision data from HuggingFace JSONL metadata format"""
+    if not os.path.isfile(jsonl_path):
+        logger.error(f"JSONL file not found: {jsonl_path}")
+        return []
+    
+    logger.info(f"Reading vision metadata: {jsonl_path}")
+    
+    # Get the directory containing the JSONL file (should contain images folder)
+    base_dir = os.path.dirname(jsonl_path)
+    images_dir = os.path.join(base_dir, 'images')
+    
+    if not os.path.isdir(images_dir):
+        logger.error(f"Images directory not found: {images_dir}")
+        return []
+    
+    vision_data = []
+    
+    with open(jsonl_path, 'r', encoding='utf-8') as f:
+        for line_num, line in enumerate(f, 1):
+            line = line.strip()
+            if not line:
+                continue
+                
+            try:
+                entry = json.loads(line)
+                
+                # Extract required fields - logger.js uses 'input' and 'response', not 'text'
+                file_name = entry.get('file_name', '')
+                input_data = entry.get('input', '')
+                response = entry.get('response', '')
+                
+                if not all([file_name, input_data, response]):
+                    logger.warning(f"Line {line_num}: Missing required fields (file_name, input, response)")
+                    continue
+                
+                # Check for bad outputs
+                if response.strip() in BAD_OUTPUTS:
+                    logger.debug(f"Line {line_num}: Skipping bad output")
+                    continue
+                
+                # Load the image
+                image_path = os.path.join(base_dir, file_name)
+                if not os.path.isfile(image_path):
+                    logger.warning(f"Line {line_num}: Image file not found: {image_path}")
+                    continue
+                
+                try:
+                    image = Image.open(image_path)
+                    if image.mode in ('RGBA', 'LA', 'P') and image.format != 'PNG': # PNG handles these modes well
+                        image = image.convert('RGB') # Convert to RGB if not PNG to simplify, or handle more modes in pil_image_to_parquet_dict
+                except Exception as e:
+                    logger.warning(f"Line {line_num}: Error loading image {image_path}: {e}")
+                    continue
+                
+                # Convert PIL image to parquet-compatible dict
+                relative_image_path_for_dict = file_name # Use the relative path from metadata
+                image_dict = pil_image_to_parquet_dict(image, relative_image_path_for_dict)
+
+                # Create a separate conversation_replacements for each vision entry
+                entry_conversation_replacements = {}
+                
+                # Replace usernames consistently within this single entry
+                def replace_usernames_in_text(text: str) -> str:
+                    global username_replaced_count
+                    global available_minecraft_usernames
+                    replaced = False
+
+                    if not MINECRAFT_USERNAMES:
+                        return text
+
+                    for orig_name in ORIGINAL_USERNAMES:
+                        if orig_name in text:
+                            if orig_name not in entry_conversation_replacements:
+                                if not available_minecraft_usernames:
+                                    available_minecraft_usernames = list(MINECRAFT_USERNAMES)
+                                replacement = random.choice(available_minecraft_usernames)
+                                available_minecraft_usernames.remove(replacement)
+                                entry_conversation_replacements[orig_name] = replacement
+                                replaced = True
+                            text = text.replace(orig_name, entry_conversation_replacements[orig_name])
+
+                    if replaced:
+                        username_replaced_count += 1
+                    return text
+                
+                # Parse the input data (conversation history) and build conversation
+                try:
+                    # The input_data should be JSON string of conversation history
+                    conversation_history = json.loads(input_data)
+                    
+                    # Build the conversation in unsloth format
+                    conversation = []
+                    
+                    if isinstance(conversation_history, list):
+                        for msg in conversation_history:
+                            if isinstance(msg, dict) and 'role' in msg:
+                                role = msg['role']
+                                # Map system messages to user role for simplicity
+                                if role == 'system':
+                                    role = 'user'
+                                
+                                content_parts = []
+                                
+                                # Handle different content formats
+                                if 'content' in msg:
+                                    content = msg['content']
+                                    if isinstance(content, str):
+                                        # Simple string content
+                                        text_content = replace_usernames_in_text(content)
+                                        content_parts.append({"type": "text", "text": text_content})
+                                    elif isinstance(content, list):
+                                        # Array content (multimodal messages)
+                                        for part in content:
+                                            if isinstance(part, dict):
+                                                if part.get('type') == 'text':
+                                                    text_content = part.get('text', '')
+                                                    if text_content:
+                                                        text_content = replace_usernames_in_text(text_content)
+                                                        content_parts.append({"type": "text", "text": text_content})
+                                                # Skip image parts from history - we'll add the main image to the user message
+                                elif any(key in msg for key in ['text', 'message', 'value']):
+                                    # Handle other message formats
+                                    text_content = msg.get('text') or msg.get('message') or msg.get('value', '')
+                                    if text_content:
+                                        text_content = replace_usernames_in_text(str(text_content))
+                                        content_parts.append({"type": "text", "text": text_content})
+                                
+                                if content_parts:
+                                    conversation.append({
+                                        "role": role,
+                                        "content": content_parts
+                                    })
+                    
+                    # If no conversation history was parsed or it's empty, create a simple user message
+                    if not conversation:
+                        # Use the raw input data as text
+                        text_content = replace_usernames_in_text(str(input_data).strip())
+                        conversation.append({
+                            "role": "user",
+                            "content": [{"type": "text", "text": text_content}]
+                        })
+                    
+                    # Add the image to the last user message (or create one if none exists)
+                    user_msg_found = False
+                    for i in range(len(conversation) - 1, -1, -1):
+                        if conversation[i]["role"] == "user":
+                            # Add image to this user message
+                            conversation[i]["content"].append({"type": "image", "image": image_dict})
+                            user_msg_found = True
+                            break
+                    
+                    if not user_msg_found:
+                        # No user message found, create one with just the image
+                        conversation.append({
+                            "role": "user", 
+                            "content": [{"type": "image", "image": image_dict}]
+                        })
+                    
+                    # Add the assistant response
+                    response_text = replace_usernames_in_text(response)
+                    conversation.append({
+                        "role": "assistant",
+                        "content": [{"type": "text", "text": response_text}]
+                    })
+                    
+                except json.JSONDecodeError:
+                    # If input_data is not valid JSON, create simple conversation
+                    text_content = replace_usernames_in_text(str(input_data).strip())
+                    response_text = replace_usernames_in_text(response)
+                    
+                    conversation = [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": text_content},
+                                {"type": "image", "image": image_dict}
+                            ]
+                        },
+                        {
+                            "role": "assistant", 
+                            "content": [{"type": "text", "text": response_text}]
+                        }
+                    ]
+                except Exception as e:
+                    logger.debug(f"Line {line_num}: Error parsing conversation history: {e}")
+                    # Fallback to simple conversation
+                    text_content = replace_usernames_in_text(str(input_data).strip())
+                    response_text = replace_usernames_in_text(response)
+                    
+                    conversation = [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": text_content},
+                                {"type": "image", "image": image_dict}
+                            ]
+                        },
+                        {
+                            "role": "assistant",
+                            "content": [{"type": "text", "text": response_text}]
+                        }
+                    ]
+                
+                vision_data.append(conversation)
+                
+            except json.JSONDecodeError as e:
+                logger.warning(f"Line {line_num}: JSON decode error: {e}")
+                continue
+            except Exception as e:
+                logger.warning(f"Line {line_num}: Unexpected error: {e}")
+                continue
+    
+    logger.info(f"Successfully processed {len(vision_data)} vision entries")
+    return vision_data
+
+def extract_vision_conversations_from_csv(csv_input: str) -> List[Dict]:
+    """Extract vision data from CSV with input,image,output columns"""
+    if not os.path.isfile(csv_input):
+        logger.debug(f"Vision CSV file not found: {csv_input}")
+        return []
+    
+    logger.info(f"Reading Vision CSV: {csv_input}")
+    
+    try:
+        df = pd.read_csv(csv_input)
+        required_columns = ['input', 'image', 'output']
+        
+        if not all(col in df.columns for col in required_columns):
+            logger.debug(f"Vision CSV missing required columns: {required_columns}")
+            return []
+        
+        vision_data = []
+        
+        for idx, row in df.iterrows():
+            try:
+                input_text = str(row['input']).strip()
+                image_b64 = str(row['image']).strip()
+                output_text = str(row['output']).strip()
+                
+                if not all([input_text, image_b64, output_text]):
+                    continue
+                
+                # Check for bad outputs
+                if output_text in BAD_OUTPUTS:
+                    continue
+                
+                # Create separate replacements for each row
+                row_conversation_replacements = {}
+                
+                # Replace usernames consistently within this single row
+                def replace_usernames_in_text(text: str) -> str:
+                    global username_replaced_count
+                    global available_minecraft_usernames
+                    replaced = False
+
+                    if not MINECRAFT_USERNAMES:
+                        return text
+
+                    for orig_name in ORIGINAL_USERNAMES:
+                        if orig_name in text:
+                            if orig_name not in row_conversation_replacements:
+                                if not available_minecraft_usernames:
+                                    available_minecraft_usernames = list(MINECRAFT_USERNAMES)
+                                replacement = random.choice(available_minecraft_usernames)
+                                available_minecraft_usernames.remove(replacement)
+                                row_conversation_replacements[orig_name] = replacement
+                                replaced = True
+                            text = text.replace(orig_name, row_conversation_replacements[orig_name])
+
+                    if replaced:
+                        username_replaced_count += 1
+                    return text
+                
+                input_text = replace_usernames_in_text(input_text)
+                output_text = replace_usernames_in_text(output_text)
+                
+                # Load image from base64
+                image = load_image_from_base64(image_b64)
+                
+                # Convert PIL image to parquet-compatible dict
+                image_filename_for_dict = f"image_from_base64_{idx}.png" # Create a placeholder filename
+                image_dict = pil_image_to_parquet_dict(image, image_filename_for_dict)
+
+                # Create conversation in unsloth format
+                conversation = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": input_text},
+                            {"type": "image", "image": image_dict}
+                        ]
+                    },
+                    {
+                        "role": "assistant",
+                        "content": [{"type": "text", "text": output_text}]
+                    }
+                ]
+                
+                vision_data.append(conversation)
+                
+            except Exception as e:
+                logger.warning(f"Row {idx}: Error processing vision data: {e}")
+                continue
+        
+        logger.info(f"Successfully processed {len(vision_data)} vision entries from CSV")
+        return vision_data
+        
+    except Exception as e:
+        logger.error(f"Error reading vision CSV {csv_input}: {e}")
+        return []
+
+def extract_conversations_from_csv(csv_input: str) -> List[List[Dict[str, str]]]:
+    if not os.path.isfile(csv_input):
+        logger.debug(f"CSV file not found: {csv_input}")
+        return []
+    
+    logger.info(f"Reading CSV: {csv_input}")
+    valid_rows = []
+    extra_issue_rows = 0
+    total_extra_columns = 0
+
+    with open(csv_input, newline='', encoding="utf-8") as csvfile:
+        reader = csv.reader(csvfile)
+        try:
+            header = next(reader)
+        except StopIteration:
+            logger.debug(f"CSV file {csv_input} is empty.")
+            return []
+        
+        header_expected = {"input", "output"}
+        header_map = {col: idx for idx, col in enumerate(header)}
+        if not header_expected.issubset(set(header)):
+            logger.debug(f"CSV header does not contain required columns: {header_expected}")
+            return []
+        
+        for idx, row in enumerate(reader, start=2):
+            non_empty_count = sum(1 for field in row if field.strip() != "")
+            if non_empty_count > 2:
+                extra = non_empty_count - 2
+                extra_issue_rows += 1
+                total_extra_columns += extra
+                logger.info(f"Row {idx} has {extra} extra filled column(s); row skipped.")
+                continue
+            row_dict = {col: row[header_map[col]] if header_map[col] < len(row) else "" for col in header_expected}
+            valid_rows.append(row_dict)
+    
+    logger.info(f"Excluded {extra_issue_rows} row(s) with extra columns (total extra columns: {total_extra_columns}).")
+    df = pd.DataFrame(valid_rows)
+    conversations = []
+    for idx, row in df.iterrows():
+        conv = create_conversation_thread(row)
+        if conversation_has_bad_output(conv):
+            continue
+        conversations.append(conv)
+    return conversations
+
+def extract_vision_conversations_from_csv(csv_input: str) -> List[Dict]:
+    """Extract vision data from CSV with input,image,output columns"""
+    if not os.path.isfile(csv_input):
+        logger.debug(f"Vision CSV file not found: {csv_input}")
+        return []
+    
+    logger.info(f"Reading Vision CSV: {csv_input}")
+    
+    try:
+        df = pd.read_csv(csv_input)
+        required_columns = ['input', 'image', 'output']
+        
+        if not all(col in df.columns for col in required_columns):
+            logger.debug(f"Vision CSV missing required columns: {required_columns}")
+            return []
+        
+        vision_data = []
+        
+        for idx, row in df.iterrows():
+            try:
+                input_text = str(row['input']).strip()
+                image_b64 = str(row['image']).strip()
+                output_text = str(row['output']).strip()
+                
+                if not all([input_text, image_b64, output_text]):
+                    continue
+                
+                # Check for bad outputs
+                if output_text in BAD_OUTPUTS:
+                    continue
+                
+                # Create separate replacements for each row
+                row_conversation_replacements = {}
+                
+                # Replace usernames consistently within this single row
+                def replace_usernames_in_text(text: str) -> str:
+                    global username_replaced_count
+                    global available_minecraft_usernames
+                    replaced = False
+
+                    if not MINECRAFT_USERNAMES:
+                        return text
+
+                    for orig_name in ORIGINAL_USERNAMES:
+                        if orig_name in text:
+                            if orig_name not in row_conversation_replacements:
+                                if not available_minecraft_usernames:
+                                    available_minecraft_usernames = list(MINECRAFT_USERNAMES)
+                                replacement = random.choice(available_minecraft_usernames)
+                                available_minecraft_usernames.remove(replacement)
+                                row_conversation_replacements[orig_name] = replacement
+                                replaced = True
+                            text = text.replace(orig_name, row_conversation_replacements[orig_name])
+
+                    if replaced:
+                        username_replaced_count += 1
+                    return text
+                
+                input_text = replace_usernames_in_text(input_text)
+                output_text = replace_usernames_in_text(output_text)
+                
+                # Load image from base64
+                image = load_image_from_base64(image_b64)
+                
+                # Convert PIL image to parquet-compatible dict
+                image_filename_for_dict = f"image_from_base64_{idx}.png" # Create a placeholder filename
+                image_dict = pil_image_to_parquet_dict(image, image_filename_for_dict)
+
+                # Create conversation in unsloth format
+                conversation = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": input_text},
+                            {"type": "image", "image": image_dict}
+                        ]
+                    },
+                    {
+                        "role": "assistant",
+                        "content": [{"type": "text", "text": output_text}]
+                    }
+                ]
+                
+                vision_data.append(conversation)
+                
+            except Exception as e:
+                logger.warning(f"Row {idx}: Error processing vision data: {e}")
+                continue
+        
+        logger.info(f"Successfully processed {len(vision_data)} vision entries from CSV")
+        return vision_data
+        
+    except Exception as e:
+        logger.error(f"Error reading vision CSV {csv_input}: {e}")
+        return []
+
+def extract_conversations_from_json(json_input: str) -> List[List[Dict[str, str]]]:
+    logger.info(f"Reading JSON: {json_input}")
+    try:
+        with open(json_input, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except Exception as e:
+        logger.debug(f"Error reading {json_input}: {e}")
+        return []
+    conversations = []
+    for conv in data:
+        messages = []
+        if "system" in conv and conv["system"]:
+            system_text = str(conv["system"]).strip()
+            system_text = replace_reasoning_prompt(system_text)
+            messages.append({"from": "human", "value": system_text})
+        if "user" in conv and conv["user"]:
+            user_text = str(conv["user"]).strip()
+            user_text = replace_reasoning_prompt(user_text)
+            messages.append({"from": "human", "value": user_text})
+        if "assistant" in conv and conv["assistant"]:
+            assistant_text = str(conv["assistant"]).strip()
+            assistant_text = replace_reasoning_prompt(assistant_text)
+            messages.append({"from": "gpt", "value": assistant_text})
+        if messages and not conversation_has_bad_output(messages):
+            conversations.append(messages)
+    return conversations
+
+if __name__ == "__main__":
+    # Handle vision dataset processing
+    if '--vision' in sys.argv:
+        if not PANDAS_IMAGE_METHODS_AVAILABLE:
+            logger.error("pandas-image-methods is required for --vision flag. Install with: pip install pandas-image-methods")
+            sys.exit(1)
+        
+        # Look for vision data files
+        vision_files = []
+        
+        # Check for HuggingFace format (metadata.jsonl)
+        metadata_jsonl = "vision_dataset/metadata.jsonl"
+        if os.path.isfile(metadata_jsonl):
+            vision_files.append((metadata_jsonl, 'jsonl'))
+        
+        # Check for CSV format vision logs
+        vision_csv = "vision_logs.csv"
+        if os.path.isfile(vision_csv):
+            vision_files.append((vision_csv, 'csv'))
+        
+        # Check for numbered files
+        i = 1
+        while True:
+            jsonl_file = f"vision_dataset{i}/metadata.jsonl"
+            csv_file = f"vision_logs{i}.csv"
+            found_any = False
+            
+            if os.path.isfile(jsonl_file):
+                vision_files.append((jsonl_file, 'jsonl'))
+                found_any = True
+            if os.path.isfile(csv_file):
+                vision_files.append((csv_file, 'csv'))
+                found_any = True
+                
+            if not found_any:
+                break
+            i += 1
+        
+        if not vision_files:
+            logger.error("No vision dataset files found for --vision flag!")
+            logger.info("Looking for:")
+            logger.info("  - vision_dataset/metadata.jsonl (HuggingFace format)")
+            logger.info("  - vision_logs.csv (CSV format)")
+            logger.info("  - vision_datasetN/metadata.jsonl")
+            logger.info("  - vision_logsN.csv")
+            sys.exit(1)
+        
+        logger.info(f"Found {len(vision_files)} vision files: {[f for f, _ in vision_files]}")
+        
+        # Process all vision files
+        all_vision_data = []
+        total_count = 0
+        file_counts = {}
+        
+        for file_path, file_type in vision_files:
+            if file_type == 'jsonl':
+                vision_data = extract_vision_data_from_jsonl(file_path)
+            else:  # csv
+                vision_data = extract_vision_conversations_from_csv(file_path)
+            
+            file_counts[file_path] = len(vision_data)
+            all_vision_data.extend(vision_data)
+            total_count += len(vision_data)
+        
+        if not all_vision_data:
+            logger.error("No valid vision data found!")
+            sys.exit(1)
+        
+        # Check for tokenization flags
+        do_tokenize = '--tokenize' in sys.argv
+        tokenizer = None
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        if do_tokenize:
+            logger.info("Loading tokenizer 'unsloth/Llama-3.2-1B-Instruct-bnb-4bit'...")
+            tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct-bnb-4bit")
+        
+        # Tokenize if requested
+        if do_tokenize and tokenizer:
+            all_texts = []
+            for entry in all_vision_data:
+                all_texts.append(entry['input'])
+                all_texts.append(entry['output'])
+            
+            total_tokens = 0
+            logger.info("Tokenizing vision data...")
+            for text in tqdm(all_texts, desc="Tokenizing", unit="msg"):
+                encoded = tokenizer(text, return_tensors="pt")
+                input_ids = encoded["input_ids"].to(device)
+                total_tokens += input_ids.shape[-1]
+            logger.info(f"Total tokens across all vision data: {total_tokens}")
+        
+        # Remove duplicates based on conversation content
+        unique_vision_data = []
+        seen_keys = set()
+        
+        for conversation in all_vision_data:
+            # Create a key from the text content of the conversation
+            key_parts = []
+            for msg in conversation:
+                if msg["role"] in ["user", "assistant"]:
+                    for content_part in msg["content"]:
+                        if content_part["type"] == "text":
+                            key_parts.append(content_part["text"].strip())
+            
+            key = tuple(key_parts)
+            if key not in seen_keys:
+                seen_keys.add(key)
+                unique_vision_data.append(conversation)
+        
+        all_vision_data = unique_vision_data
+        logger.info(f"After deduplication: {len(all_vision_data)} unique vision conversations")
+        
+        # Shuffle the data
+        random.shuffle(all_vision_data)
+        
+        # Images are already in parquet-compatible dict format within all_vision_data
+        # No further image processing needed here before creating DataFrame
+        
+        # Create DataFrame with conversations column (unsloth format)
+        df_final = pd.DataFrame({"conversations": all_vision_data})
+        
+        output_parquet = "Andy_vision_conversations.parquet"
+        
+        logger.info(f"Writing vision dataset to {output_parquet}")
+        try:
+            df_final.to_parquet(output_parquet, index=False)
+            abs_path = os.path.abspath(output_parquet)
+            logger.info(f"Successfully wrote vision dataset to: {abs_path}")
+        except Exception as e:
+            logger.error(f"Error writing Parquet file: {e}")
+            sys.exit(1)
+        
+        logger.info(
+            f"\n"
+            f"--------------------------------------------------------------------------------------\n"
+            f"Vision conversion complete! Processed {total_count} vision conversations from {len(vision_files)} files.\n"
+            f"Replaced {username_replaced_count} usernames across conversations.\n"
+            f"Total usernames available: {len(MINECRAFT_USERNAMES)}\n"
+            f"Final dataset size: {len(all_vision_data)} unique conversations\n"
+            f"--------------------------------------------------------------------------------------\n"
+        )
+        
+        # Log counts per file
+        for file_path, count in file_counts.items():
+            logger.info(f"File '{file_path}' contributed {count} conversations.")
+        
+        sys.exit(0)
+    
+    # Regular processing for non-vision data
+    base_filename = "Andy_pre"
+    files = []
+    i = 1
+    while True:
+        csv_file = f"{base_filename}{i}.csv"
+        json_file = f"{base_filename}{i}.json"
+        if not os.path.isfile(csv_file) and not os.path.isfile(json_file):
+            break
+        if os.path.isfile(csv_file):
+            files.append((csv_file, 'csv'))
+        if os.path.isfile(json_file):
+            files.append((json_file, 'json'))
+        i += 1
+
+    if not files:
+        logger.info("No CSV or JSON files found with pattern Andy_preN.(csv|json)")
+        sys.exit(1)
+
+    # Check for tokenization flags
+    do_tokenize = '--tokenize' in sys.argv
+    do_tokenize_largest = '--tokenize_largest' in sys.argv
+    tokenizer = None
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if do_tokenize or do_tokenize_largest:
+        logger.info("Loading tokenizer 'unsloth/Llama-3.2-1B-Instruct-bnb-4bit'...")
+        tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B-Instruct-bnb-4bit")
+
+    logger.info(f"Found {len(files)} files: {[f for f, _ in files]}")
+    combined_conversations = []
+    total_count = 0
+    file_conversation_counts = {}
+
+    for file, ftype in files:
+        if ftype == 'csv':
+            convs = extract_conversations_from_csv(file)
+        else:
+            convs = extract_conversations_from_json(file)
+        file_conversation_counts[file] = len(convs)
+        combined_conversations.extend(convs)
+        total_count += len(convs)
+
+    # Tokenize all data and count tokens
+    if do_tokenize:
+        all_texts = [msg["value"] for conv in combined_conversations for msg in conv]
+        total_tokens = 0
+        logger.info("Tokenizing all data with progress bar and GPU acceleration...")
+        for text in tqdm(all_texts, desc="Tokenizing", unit="msg"):
+            encoded = tokenizer(text, return_tensors="pt")
+            input_ids = encoded["input_ids"].to(device)
+            total_tokens += input_ids.shape[-1]
+        logger.info(f"Total tokens across all data: {total_tokens}")
+
+    # Tokenize 5 largest conversations
+    if do_tokenize_largest:
+        conv_token_counts = []
+        logger.info("Tokenizing largest conversations with progress bar and GPU acceleration...")
+        for conv in tqdm(combined_conversations, desc="Tokenizing convs", unit="conv"):
+            text = "\n".join(msg["value"] for msg in conv)
+            encoded = tokenizer(text, return_tensors="pt")
+            input_ids = encoded["input_ids"].to(device)
+            conv_token_counts.append((input_ids.shape[-1], conv))
+        # sort and take top 5
+        conv_token_counts.sort(key=lambda x: x[0], reverse=True)
+        top5 = conv_token_counts[:5]
+        max_tokens = max(count for count, _ in top5)
+        for idx, (count, _) in enumerate(top5, 1):
+            logger.info(f"Top {idx} conversation tokens: {count}")
+        logger.info(f"Maximum tokens in top 5: {max_tokens}")
+
+    # Clean up GPT messages
+    for conv in combined_conversations:
+        for msg in conv:
+            if msg["from"] == "gpt":
+                msg["value"] = msg["value"].replace("<think>\nundefined</think>\n", "").replace("<think>\nundefined</think>", "").strip()
+
+    unique_conversations = []
+    seen_keys = set()
+    for conv in combined_conversations:
+        if len(conv) < 2:
+            key = tuple(msg["value"] for msg in conv)
+        else:
+            key = (conv[0]["value"].strip(), conv[-1]["value"].strip())
+        if key not in seen_keys:
+            seen_keys.add(key)
+            unique_conversations.append(conv)
+    combined_conversations = unique_conversations
+
+    random.shuffle(combined_conversations)
+
+    # Handle codeOnly flag
+    if '--codeOnly' in sys.argv:
+        coding = []
+        noncoding = []
+        for conv in combined_conversations:
+            has_code = any("```" in msg["value"] for msg in conv) or (
+                conv and conv[-1]["from"] == "gpt" and "!newAction(" in conv[-1]["value"]
+            )
+            if has_code:
+                coding.append(conv)
+            else:
+                noncoding.append(conv)
+        logger.info(f"Found {len(coding)} coding examples and {len(noncoding)} non-coding examples.")
+        noncoding_count = int(round(0.15 * len(coding)))
+        if noncoding_count > len(noncoding):
+            noncoding_count = len(noncoding)
+        selected_noncoding = random.sample(noncoding, noncoding_count) if noncoding_count > 0 else []
+        final_conversations = coding + selected_noncoding
+        random.shuffle(final_conversations)
+        combined_conversations = final_conversations
+
+    if '--codeOnly' in sys.argv:
+        df_final = pd.DataFrame({"conversations": combined_conversations})
+        output_parquet = "Andy_conversations_codeOnly.parquet"
+    else:
+        df_final = pd.DataFrame({"conversations": combined_conversations})
+        output_parquet = "Andy_conversations.parquet"
+
+    logger.info(f"Writing output to {output_parquet}")
+    try:
+        df_final.to_parquet(output_parquet, index=False)
+        abs_path = os.path.abspath(output_parquet)
+        logger.info(f"Successfully wrote output to: {abs_path}")
+    except Exception as e:
+        logger.debug(f"Error writing Parquet file: {e}")
+        sys.exit(1)
+
+    logger.info(
+        f"\n"
+        f"--------------------------------------------------------------------------------------\n\n"
+        f"Conversion complete! Processed {total_count} conversations from {len(files)} files. \n"
+        f"Replaced {username_replaced_count} usernames across {total_count} conversations. \n"
+        f"Total amount of usernames to choose from: {len(MINECRAFT_USERNAMES)} (removed {duplicate_count} duplicates) \n"
+        f"--------------------------------------------------------------------------------------\n\n"
+    )
+
+    # Log conversation counts per file.
+    for file, count in file_conversation_counts.items():
+        logger.info(f"File '{file}' contributed {count} conversations.")
--- a/logs/generate_usernames.py
+++ b/logs/generate_usernames.py
--- a/logs/requirements.txt
+++ b/logs/requirements.txt
@ -1 +1,18 @@
+# Core dependencies for convert.py
+pandas>=1.3.0
+pandas-image-methods>=0.2.0
+transformers>=4.20.0
+torch>=1.12.0
+tqdm>=4.64.0
+pillow>=9.0.0
+pyarrow>=10.0.0
+
+# Optional dependencies for enhanced functionality
+datasets>=2.0.0
+dask[complete]>=2022.7.0
+distributed>=2022.7.0
+
+# Additional utility dependencies
+numpy>=1.21.0
+requests>=2.25.0