Forget the perfect tutorials. Here's how to build something that actually works, with all the ugly error handling and resource management you need.
Phase 1: Get Something Working (Don't Optimize Yet)
Start stupidly simple. One modality, basic error handling, no fancy architecture. Get text processing working first:
import os
import asyncio
import tempfile
import shutil
from typing import Optional, Dict, Any
from pathlib import Path
## Modern LangChain imports (not the deprecated stuff)
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.tools import tool
## Set up the model (GPT-4o handles vision and text)
model = init_chat_model(
"gpt-4o",
model_provider="openai",
temperature=0,
max_tokens=1000
)
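One thing this setup quietly assumes: the OpenAI provider reads your API key from the environment. Fail fast at startup instead of on the first user request:
# init_chat_model with model_provider="openai" picks up OPENAI_API_KEY from the environment
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set - export it before running")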
@tool
def analyze_text(text: str) -> str:
"""Process text input with proper error handling."""
try:
messages = [
SystemMessage(content="You are a helpful multi-modal AI assistant."),
HumanMessage(content=text)
]
response = model.invoke(messages)
return response.content
except Exception as e:
return f"Text processing failed: {str(e)}"
Add image processing next. This is where things get fun (read: painful):
import base64
from PIL import Image
import io
@tool
def analyze_image(image_path: str) -> str:
"""Analyze image with proper error handling and memory management."""
try:
# Check file exists and is readable
if not os.path.exists(image_path):
return f"Image file not found: {image_path}"
# Check file size (don't process huge files)
file_size = os.path.getsize(image_path)
if file_size > 20 * 1024 * 1024: # 20MB limit
return "Image too large - max 20MB"
# Open and validate image
try:
with Image.open(image_path) as img:
# Convert to RGB if needed (fixes RGBA issues)
if img.mode != 'RGB':
img = img.convert('RGB')
# Resize if too large (OpenAI has limits)
max_dimension = 2048
if max(img.width, img.height) > max_dimension:
img.thumbnail((max_dimension, max_dimension), Image.Resampling.LANCZOS)
# Convert to base64
buffer = io.BytesIO()
img.save(buffer, format='JPEG', quality=85)
image_data = base64.b64encode(buffer.getvalue()).decode('utf-8')
except Exception as e:
return f"Image processing failed: {str(e)}"
# Send to OpenAI Vision API
messages = [
HumanMessage(content=[
{"type": "text", "text": "Analyze this image in detail."},
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image_data}"}
}
])
]
response = model.invoke(messages)
return response.content
except Exception as e:
return f"Vision analysis failed: {str(e)}"
Audio processing is a nightmare. Here's how to make it less terrible:
import subprocess
import whisper
from pydub import AudioSegment
## Load Whisper model once (not per request)
whisper_model = None
def get_whisper_model():
"""Lazy load Whisper model to save memory."""
global whisper_model
if whisper_model is None:
# Use 'base' model - good balance of speed/accuracy
whisper_model = whisper.load_model("base")
return whisper_model
@tool
def transcribe_audio(audio_path: str) -> str:
"""Transcribe audio with format conversion and error handling."""
temp_dir = None
try:
# Check file exists
if not os.path.exists(audio_path):
return f"Audio file not found: {audio_path}"
# Create temp directory
temp_dir = tempfile.mkdtemp()
converted_path = os.path.join(temp_dir, "converted.wav")
# Convert audio to WAV format (Whisper likes this)
try:
audio = AudioSegment.from_file(audio_path)
# Limit length (don't process hours of audio)
max_duration = 5 * 60 * 1000 # 5 minutes in milliseconds
if len(audio) > max_duration:
audio = audio[:max_duration]
print(f"Audio truncated to 5 minutes")
# Export as WAV
audio.export(converted_path, format="wav", parameters=["-ac", "1", "-ar", "16000"])
except Exception as e:
return f"Audio conversion failed: {str(e)}"
# Transcribe with Whisper
try:
model = get_whisper_model()
result = model.transcribe(converted_path)
text = result["text"].strip()
if not text:
return "No speech detected in audio"
return f"Transcription: {text}"
except Exception as e:
return f"Transcription failed: {str(e)}"
except Exception as e:
return f"Audio processing failed: {str(e)}"
finally:
# Clean up temp files
if temp_dir and os.path.exists(temp_dir):
shutil.rmtree(temp_dir, ignore_errors=True)
Phase 2: Multi-Modal Coordination (Where It Gets Real)
Don't try to be clever with fusion. Process each modality separately, then combine results:
async def process_multimodal_input(
text: Optional[str] = None,
image_path: Optional[str] = None,
audio_path: Optional[str] = None
) -> Dict[str, Any]:
"""Process multiple modalities with timeout and resource limits."""
results = {
"text": None,
"image": None,
"audio": None,
"combined": None,
"errors": []
}
# Create processing tasks
tasks = []
if text:
tasks.append(("text", analyze_text.ainvoke({"text": text})))
if image_path:
tasks.append(("image", analyze_image.ainvoke({"image_path": image_path})))
if audio_path:
tasks.append(("audio", transcribe_audio.ainvoke({"audio_path": audio_path})))
if not tasks:
return {"error": "No input provided"}
# Process with timeout
try:
# Wait for all tasks with 60 second timeout
completed = await asyncio.wait_for(
asyncio.gather(*[task[1] for task in tasks], return_exceptions=True),
timeout=60.0
)
# Collect results
for modality, result in zip([task[0] for task in tasks], completed):
if isinstance(result, Exception):
results["errors"].append(f"{modality}: {str(result)}")
else:
results[modality] = result
# Simple combination - just concatenate results
valid_results = [v for k, v in results.items()
if k not in ["combined", "errors"] and v is not None]
if valid_results:
results["combined"] = "
".join([
f"**{modality.title()} Analysis:**
{result}"
for modality, result in zip([task[0] for task in tasks], completed)
if not isinstance(result, Exception)
])
return results
except asyncio.TimeoutError:
return {"error": "Processing timeout - inputs too large"}
except Exception as e:
return {"error": f"Processing failed: {str(e)}"}
Phase 3: Production Realities
Resource monitoring is mandatory:
import psutil
import gc
import time
from contextlib import contextmanager
@contextmanager
def resource_monitor(operation_name: str):
"""Monitor resource usage and clean up aggressively."""
start_time = time.time()
start_memory = psutil.virtual_memory().percent
try:
yield
finally:
# Force cleanup
gc.collect()
# Clear GPU cache if available
try:
import torch
if torch.cuda.is_available():
torch.cuda.empty_cache()
except ImportError:
pass
end_time = time.time()
end_memory = psutil.virtual_memory().percent
duration = end_time - start_time
memory_delta = end_memory - start_memory
print(f"{operation_name}: {duration:.2f}s, memory: {memory_delta:+.1f}%")
# Alert on resource issues
if duration > 30:
print(f"WARNING: {operation_name} took {duration:.1f}s")
if end_memory > 85:
print(f"WARNING: High memory usage: {end_memory:.1f}%")
## Usage
async def safe_multimodal_processing(**kwargs):
"""Wrapper with resource monitoring."""
with resource_monitor("multimodal_processing"):
return await process_multimodal_input(**kwargs)
Error recovery patterns:
from tenacity import retry, stop_after_attempt, wait_exponential
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=2, max=10)
)
async def robust_multimodal_agent(inputs: Dict[str, Any]) -> Dict[str, Any]:
"""Multi-modal agent with retry logic and fallbacks."""
try:
# Try full multi-modal processing
return await safe_multimodal_processing(**inputs)
except Exception as e:
# Fallback to text-only if multi-modal fails
if "text" in inputs:
try:
text_result = await analyze_text.ainvoke({"text": inputs["text"]})
return {
"text": text_result,
"fallback": True,
"error": f"Multi-modal processing failed, used text fallback: {str(e)}"
}
except Exception as text_error:
return {
"error": f"All processing failed: {str(text_error)}"
}
return {"error": f"Processing failed: {str(e)}"}
What You Learn After It Breaks in Production
File handling will destroy you. Users upload 100MB videos, corrupted images, and audio files that crash FFmpeg. Check everything:
def validate_input_file(file_path: str, max_size_mb: int = 50) -> tuple[bool, str]:
"""Validate input file before processing."""
if not os.path.exists(file_path):
return False, "File does not exist"
# Check size
size_mb = os.path.getsize(file_path) / (1024 * 1024)
if size_mb > max_size_mb:
return False, f"File too large: {size_mb:.1f}MB (max {max_size_mb}MB)"
# Check if file is readable
try:
with open(file_path, 'rb') as f:
f.read(1024) # Try to read first 1KB
except Exception as e:
return False, f"File not readable: {str(e)}"
return True, "Valid"
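Call it at the top of every file-based tool before doing any real work - a sketch, with a made-up path:
# Gate processing on validation first (the path here is hypothetical)
ok, reason = validate_input_file("uploads/user_clip.mp3", max_size_mb=20)
if not ok:
    print(f"Rejected upload: {reason}")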
Memory usage grows without bounds. Multi-modal processing accumulates tensors, cached models, and temp files. Monitor and clean up aggressively or your system dies.
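Here's a sketch of what "aggressively" means - it assumes the lazy-loaded whisper_model global from earlier and drops it when RAM gets tight, so the next request pays a reload instead of the whole box falling over:
def emergency_cleanup(threshold_percent: float = 85.0) -> None:
    """Drop cached models and GPU memory when RAM crosses a threshold."""
    global whisper_model
    if psutil.virtual_memory().percent < threshold_percent:
        return
    whisper_model = None  # get_whisper_model() will reload it on the next request
    gc.collect()
    try:
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except ImportError:
        pass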
API costs explode fast. GPT-4V costs $0.01 per image. Process 1000 images and you've spent $10. Set budget alerts or go bankrupt.
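A dumb budget guard goes a long way. This sketch reuses the rough $0.01-per-image figure from above - the class name and limits are made up, so tune them to your actual pricing:
class BudgetGuard:
    """Track rough vision spend and refuse calls past a daily cap."""
    def __init__(self, daily_limit_usd: float = 25.0, cost_per_image_usd: float = 0.01):
        self.daily_limit_usd = daily_limit_usd
        self.cost_per_image_usd = cost_per_image_usd
        self.spent_today = 0.0
    def allow_image_call(self) -> bool:
        if self.spent_today + self.cost_per_image_usd > self.daily_limit_usd:
            return False
        self.spent_today += self.cost_per_image_usd
        return True
budget = BudgetGuard()
if not budget.allow_image_call():
    print("Daily vision budget exhausted - skipping image analysis")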
The key insight: start simple, add complexity only when needed. Your first multi-modal agent should barely work. Make it robust before making it smart.
If you've made it this far, you're probably hitting the same problems everyone hits. Here are the questions I get asked constantly by people building multi-modal agents, with brutally honest answers.