When Azure OpenAI breaks in production, you need telemetry that shows what's actually wrong, not Microsoft's vague "something happened" messages. Here's how to build monitoring that saves your ass at 3am.
The Monitoring Stack That Works
Azure Monitor is garbage for OpenAI debugging. The default metrics tell you nothing useful - request counts and latency percentiles don't help when you're troubleshooting why specific calls are failing.
Build custom logging that captures:
- Full request/response pairs (sanitized for PII)
- Token counts per request (Azure's billing can surprise you)
- Model-specific error rates (some models fail more than others)
- Regional failure patterns (East US 2 fails differently than West Europe)
import logging
import json
from datetime import datetime

class AzureOpenAILogger:
    def __init__(self):
        self.logger = logging.getLogger('azure_openai')

    def extract_region_from_endpoint(self, endpoint):
        # Assumes endpoints shaped like https://<region>.openai.azure.com
        if not endpoint:
            return None
        return endpoint.split('//')[-1].split('.')[0]

    def log_request(self, request, response, error=None):
        log_data = {
            'timestamp': datetime.utcnow().isoformat(),
            'model': request.get('model'),
            'region': self.extract_region_from_endpoint(request.get('endpoint')),
            'input_tokens': response.get('usage', {}).get('prompt_tokens', 0),
            'output_tokens': response.get('usage', {}).get('completion_tokens', 0),
            'latency_ms': response.get('latency_ms'),
            'error_code': error.get('code') if error else None,
            'error_message': error.get('message') if error else None,
            'retry_after': error.get('headers', {}).get('retry-after') if error else None
        }
        if error:
            self.logger.error(f"Azure OpenAI Error: {json.dumps(log_data)}")
        else:
            self.logger.info(f"Azure OpenAI Success: {json.dumps(log_data)}")
Real-Time Error Detection
Azure's built-in alerting sucks. By the time their alerts fire, your users have been suffering for 10+ minutes. Build proactive detection:
Circuit Breaker Pattern
Stop sending requests to a failing deployment before the failures cascade:
import time

class AzureOpenAICircuitBreaker:
    def __init__(self, failure_threshold=5, timeout=60):
        self.failure_count = 0
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.last_failure_time = None
        self.state = 'CLOSED'  # CLOSED, OPEN, HALF_OPEN

    def call(self, func):
        # Refuse calls while OPEN; after the cooldown, let one probe through (HALF_OPEN)
        if self.state == 'OPEN':
            if time.time() - self.last_failure_time > self.timeout:
                self.state = 'HALF_OPEN'
            else:
                raise Exception("Circuit breaker is OPEN")
        try:
            result = func()
            if self.state == 'HALF_OPEN':
                self.state = 'CLOSED'
                self.failure_count = 0
            return result
        except Exception:
            self.failure_count += 1
            self.last_failure_time = time.time()
            if self.failure_count >= self.failure_threshold:
                self.state = 'OPEN'
            raise
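Wiring the breaker in front of your completion calls is a one-liner. A sketch, assuming a call_azure_openai() wrapper of your own that raises on failure:

breaker = AzureOpenAICircuitBreaker(failure_threshold=5, timeout=60)

def get_completion(messages):
    # Any exception from the wrapped call counts toward the failure threshold;
    # once the breaker is OPEN, callers fail immediately instead of piling up.
    return breaker.call(lambda: call_azure_openai(messages))  # call_azure_openai is your own client wrapper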
Health Check Endpoints
Don't wait for user complaints:
import time
import openai  # legacy openai<1.0 SDK; swap for the 1.x AsyncAzureOpenAI client if that's what you run

async def health_check():
    """Test all your Azure OpenAI deployments every 30 seconds"""
    # Assumes openai.api_type / api_base / api_version / api_key are configured for Azure elsewhere
    deployments = [
        {'name': 'gpt-4o-prod', 'endpoint': 'https://eastus2.openai.azure.com'},
        {'name': 'gpt-35-fallback', 'endpoint': 'https://westeurope.openai.azure.com'}
    ]
    health_status = {}
    for deployment in deployments:
        start = time.monotonic()
        try:
            # Simple test call
            response = await openai.ChatCompletion.acreate(
                engine=deployment['name'],
                messages=[{"role": "user", "content": "test"}],
                max_tokens=1,
                request_timeout=5  # Fail fast
            )
            health_status[deployment['name']] = {
                'status': 'healthy',
                'latency_ms': (time.monotonic() - start) * 1000,
                'tokens_used': response['usage']['total_tokens']
            }
        except Exception as e:
            health_status[deployment['name']] = {
                'status': 'failed',
                'error': str(e)
            }
    return health_status
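The docstring says "every 30 seconds", so something has to drive it. A minimal asyncio loop, assuming you log (or page) on anything that comes back unhealthy:

import asyncio
import logging

async def run_health_checks(interval_seconds=30):
    logger = logging.getLogger('azure_openai.health')
    while True:
        status = await health_check()
        for name, result in status.items():
            if result['status'] != 'healthy':
                # Hook your paging/alerting here instead of (or as well as) logging
                logger.error(f"Deployment {name} unhealthy: {result}")
        await asyncio.sleep(interval_seconds)

# asyncio.run(run_health_checks())  # run as a sidecar task or its own process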
Performance Debugging
Token Usage Tracking
The most expensive debugging you'll ever do:
Azure bills by tokens, but their usage reporting lags by hours. Track this yourself or watch your budget explode:
from datetime import datetime

class TokenTracker:
    def __init__(self):
        self.daily_usage = {}

    def track_usage(self, model, input_tokens, output_tokens):
        today = datetime.now().date().isoformat()
        if today not in self.daily_usage:
            self.daily_usage[today] = {}
        if model not in self.daily_usage[today]:
            self.daily_usage[today][model] = {'input': 0, 'output': 0, 'cost': 0}
        self.daily_usage[today][model]['input'] += input_tokens
        self.daily_usage[today][model]['output'] += output_tokens
        # Per-1K-token prices -- treat these as placeholders and verify them
        # against the current Azure OpenAI pricing page before trusting the totals
        costs = {
            'gpt-4o': {'input': 0.03, 'output': 0.06},
            'gpt-35-turbo': {'input': 0.002, 'output': 0.002}
        }
        if model in costs:
            input_cost = (input_tokens / 1000) * costs[model]['input']
            output_cost = (output_tokens / 1000) * costs[model]['output']
            self.daily_usage[today][model]['cost'] += input_cost + output_cost
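Here's a sketch of feeding the tracker from real responses and flagging when the day's spend crosses a budget. The $50 figure is an arbitrary example, and send_alert is the same webhook helper the article leans on later (a simple version is sketched under Pattern Recognition).

tracker = TokenTracker()

def record_response(model, response, daily_budget_usd=50):
    # Pull token counts straight off the response's usage block
    usage = response.get('usage', {})
    tracker.track_usage(model, usage.get('prompt_tokens', 0), usage.get('completion_tokens', 0))
    # Sum today's cost across all models and compare against the budget
    today = datetime.now().date().isoformat()
    spend = sum(m['cost'] for m in tracker.daily_usage.get(today, {}).values())
    if spend > daily_budget_usd:
        send_alert(f"Daily Azure OpenAI spend ${spend:.2f} exceeded budget ${daily_budget_usd}")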
Regional Performance Monitoring
Because Azure regions are not created equal:
import time
import requests

def monitor_regional_performance():
    """Track which regions are actually fast vs which are lying"""
    regions = {
        'eastus2': 'https://eastus2.openai.azure.com',
        'westeurope': 'https://westeurope.openai.azure.com',
        'swedencentral': 'https://swedencentral.openai.azure.com'
    }
    performance_data = {}
    for region, endpoint in regions.items():
        start_time = time.time()
        try:
            # Test identical request across regions; get_token() is your own helper that
            # returns a Microsoft Entra ID bearer token (or swap in an 'api-key' header)
            response = requests.post(
                f"{endpoint}/openai/deployments/gpt-4o/chat/completions",
                params={'api-version': '2024-02-01'},  # Azure rejects calls without an api-version
                headers={'Authorization': f'Bearer {get_token()}'},
                json={
                    'messages': [{'role': 'user', 'content': 'Hello'}],
                    'max_tokens': 10
                },
                timeout=10
            )
            latency = (time.time() - start_time) * 1000
            performance_data[region] = {
                'latency_ms': latency,
                'status': 'healthy' if response.status_code == 200 else 'degraded',
                'tokens_per_second': 10 / (latency / 1000) if latency > 0 else 0
            }
        except Exception as e:
            performance_data[region] = {
                'latency_ms': None,
                'status': 'failed',
                'error': str(e)
            }
    return performance_data
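One practical use of performance_data beyond graphing it: deciding where your next fallback call should go. A sketch that picks the healthiest, fastest region -- the routing policy here is an assumption, tune it to your own failover rules:

def pick_best_region(performance_data):
    # Prefer healthy regions, then lowest latency; degraded regions are a last resort
    healthy = {r: d for r, d in performance_data.items() if d['status'] == 'healthy'}
    candidates = healthy or {r: d for r, d in performance_data.items() if d['status'] == 'degraded'}
    if not candidates:
        return None  # everything is down; time to shed load or queue
    return min(candidates, key=lambda r: candidates[r]['latency_ms'])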
Log Analysis for Common Patterns
Pattern Recognition
Spot the problems before they become disasters:
def analyze_failure_patterns(logs):
    """Find patterns in your failures that Microsoft won't tell you about"""
    patterns = {
        'dns_timeouts': 0,
        'quota_exceeded': 0,
        'model_unavailable': 0,
        'authentication_expired': 0
    }
    for log_entry in logs:
        # error_message is None on success entries, so default to ''
        error_msg = (log_entry.get('error_message') or '').lower()
        if 'timeout' in error_msg and 'dns' in error_msg:
            patterns['dns_timeouts'] += 1
        elif 'rate limit' in error_msg or '429' in error_msg:
            patterns['quota_exceeded'] += 1
        elif 'deployment not found' in error_msg:
            patterns['model_unavailable'] += 1
        elif 'forbidden' in error_msg or '403' in error_msg:
            patterns['authentication_expired'] += 1
    # Alert if any pattern is increasing
    for pattern, count in patterns.items():
        if count > 10:  # Threshold based on your volume
            send_alert(f"Pattern detected: {pattern} occurred {count} times")
    return patterns
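send_alert is left undefined above. A minimal version that posts to an incident webhook -- Slack and Teams incoming webhooks both accept a simple JSON POST like this; the environment variable name and payload shape are placeholders, adapt them to your tooling:

import os
import logging
import requests

ALERT_WEBHOOK_URL = os.environ.get('ALERT_WEBHOOK_URL')  # e.g. a Slack incoming webhook

def send_alert(message):
    if not ALERT_WEBHOOK_URL:
        # No webhook configured: at least make the alert visible in the logs
        logging.getLogger('azure_openai.alerts').warning(message)
        return
    # Slack-style incoming webhooks accept {'text': ...}; adjust for your tool
    requests.post(ALERT_WEBHOOK_URL, json={'text': message}, timeout=5)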
Cost Spike Detection
Because Azure billing surprises are never good surprises:
def detect_cost_spikes(current_usage, historical_average):
    """Alert when token usage exceeds normal patterns by 50%+"""
    spike_threshold = 1.5  # 50% increase
    alerts = []
    for model, usage in current_usage.items():
        if model in historical_average:
            avg_daily_cost = historical_average[model]['daily_cost']
            current_cost = usage['cost']
            if current_cost > (avg_daily_cost * spike_threshold):
                spike_percentage = ((current_cost - avg_daily_cost) / avg_daily_cost) * 100
                alerts.append({
                    'model': model,
                    'spike_percentage': spike_percentage,
                    'current_cost': current_cost,
                    'expected_cost': avg_daily_cost
                })
    return alerts
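detect_cost_spikes needs a baseline to compare against. A sketch that derives historical_average from the TokenTracker's daily_usage over a trailing window -- the seven-day window is an arbitrary choice:

def build_historical_average(daily_usage, days=7):
    """Average per-model daily cost over the most recent `days` days of tracker data."""
    totals = {}
    # ISO date strings sort chronologically, so the slice grabs the trailing window
    recent_days = sorted(daily_usage.keys())[-days:]
    for day in recent_days:
        for model, usage in daily_usage[day].items():
            totals.setdefault(model, []).append(usage['cost'])
    return {model: {'daily_cost': sum(costs) / len(costs)} for model, costs in totals.items()}

# Compare today's tracker entry against the trailing average:
# alerts = detect_cost_spikes(tracker.daily_usage.get(datetime.now().date().isoformat(), {}),
#                             build_historical_average(tracker.daily_usage))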
The monitoring setup takes 2-3 hours to implement but will save you weeks of debugging time. Don't rely on Azure's built-in monitoring - it's designed for Microsoft's convenience, not your troubleshooting needs.