Monitoring isn't about collecting metrics - it's about predicting failures and diagnosing problems when everything's on fire. Here's monitoring that actually helps you fix issues at 3 AM: memory leak detection, database query tracking, error reporting with real context, health checks, and alerting that doesn't cry wolf.
Most APM tools give you pretty graphs but don't tell you why your app is slow. Here's monitoring built on psutil process metrics and tracemalloc snapshots that actually identifies root causes:
Memory Leak Detection
```python
# Memory monitoring middleware that catches leaks before they kill servers
import logging
import os
import time
import tracemalloc

import psutil
from django.core.cache import cache

logger = logging.getLogger(__name__)


class MemoryLeakDetector:
    def __init__(self, get_response):
        self.get_response = get_response
        tracemalloc.start()

    def __call__(self, request):
        # Track memory before the request
        process = psutil.Process(os.getpid())
        initial_memory = process.memory_info().rss / 1024 / 1024  # MB

        # Take a tracemalloc snapshot
        snapshot_before = tracemalloc.take_snapshot()

        response = self.get_response(request)

        # Check memory after the request
        final_memory = process.memory_info().rss / 1024 / 1024  # MB
        memory_delta = final_memory - initial_memory

        # Alert if memory usage is concerning
        if memory_delta > 50:  # More than 50MB per request
            snapshot_after = tracemalloc.take_snapshot()
            top_stats = snapshot_after.compare_to(snapshot_before, 'lineno')

            leak_details = []
            for stat in top_stats[:10]:
                leak_details.append(f"{stat.size_diff / 1024:.1f}KB: {stat.traceback}")

            logger.error(f"Memory leak detected in {request.path}: "
                         f"Delta: {memory_delta:.1f}MB, Details: {leak_details}")

        # Store for trend analysis
        cache_key = f"memory_trend_{request.path.replace('/', '_')}"
        memory_history = cache.get(cache_key, [])
        memory_history.append({
            'timestamp': time.time(),
            'memory_delta': memory_delta,
            'url': request.path
        })

        # Keep only the last 100 entries
        if len(memory_history) > 100:
            memory_history = memory_history[-100:]

        cache.set(cache_key, memory_history, 3600)  # 1 hour

        return response
```
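None of this runs until the middleware is registered. A minimal sketch of the settings entry, assuming the class lives in a hypothetical myproject/middleware.py - put it near the top so it wraps the rest of the stack:

```python
# settings.py - module path is an assumption; adjust to where the class actually lives
MIDDLEWARE = [
    "django.middleware.security.SecurityMiddleware",
    "myproject.middleware.MemoryLeakDetector",  # wraps everything below it
    "django.contrib.sessions.middleware.SessionMiddleware",
    "django.middleware.common.CommonMiddleware",
    # ... the rest of the standard stack
]
```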
Database Query Monitoring
```python
# Database performance tracking that identifies slow queries
import logging

from django.db import connection, reset_queries

logger = logging.getLogger(__name__)


class DatabasePerformanceMonitor:
    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        # Force query logging on this connection instead of flipping
        # settings.DEBUG at runtime, which is global and unsafe in production
        connection.force_debug_cursor = True
        reset_queries()

        try:
            response = self.get_response(request)
        finally:
            queries = connection.queries
            connection.force_debug_cursor = False

        # Analyze query performance
        slow_queries = []
        total_time = 0
        for query in queries:
            query_time = float(query['time'])
            total_time += query_time
            if query_time > 0.1:  # Queries over 100ms
                slow_queries.append({
                    'sql': query['sql'][:500],  # Truncate long queries
                    'time': query_time,
                    'url': request.path
                })

        # Alert on performance issues
        if len(queries) > 50:  # Likely an N+1 query problem
            logger.warning(f"N+1 query suspected: {request.path} executed {len(queries)} queries")

        if total_time > 1.0:  # Total DB time over 1 second
            logger.error(f"Slow database operations: {request.path} took {total_time:.2f}s in DB")

        # Log slow queries for analysis
        for slow_query in slow_queries:
            logger.warning(f"Slow query in {slow_query['url']}: "
                           f"{slow_query['time']:.3f}s - {slow_query['sql']}")

        return response
```
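When the monitor flags a view for query count, the fix is almost always eager loading. A quick sketch with hypothetical Order/OrderItem models showing the difference:

```python
from myapp.models import Order  # hypothetical app and models for illustration

# N+1: one query for the orders, then one more query per order for its customer
for order in Order.objects.all():
    print(order.customer.name)

# Fixed: select_related pulls the customer in via a JOIN, so it's one query total
for order in Order.objects.select_related("customer"):
    print(order.customer.name)

# For reverse or many-to-many relations, prefetch_related batches them into one extra query
orders = Order.objects.prefetch_related("items")
```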
Error Tracking That Actually Helps Debug Issues
Error messages without context are useless. Here's error tracking that includes the information you need to fix problems:
```python
# Enhanced error reporting
import os
import re

import psutil
import sentry_sdk
from django.utils import timezone
from sentry_sdk.integrations.celery import CeleryIntegration
from sentry_sdk.integrations.django import DjangoIntegration
from sentry_sdk.integrations.redis import RedisIntegration


def before_send(event, hint):
    """Add context that helps debug production issues"""
    extra = event.setdefault('extra', {})

    # Add memory usage to error reports
    process = psutil.Process(os.getpid())
    extra['memory_usage_mb'] = process.memory_info().rss / 1024 / 1024
    extra['cpu_percent'] = process.cpu_percent()

    # Add database connection info
    from django.db import connection
    extra['db_queries_count'] = len(connection.queries) if hasattr(connection, 'queries') else 0

    # Add active user sessions (skip silently if the database itself is down)
    try:
        from django.contrib.sessions.models import Session
        extra['active_sessions'] = Session.objects.filter(
            expire_date__gte=timezone.now()
        ).count()
    except Exception:
        pass

    # Scrub sensitive data
    if 'exception' in hint and hasattr(hint['exception'], 'args'):
        args = list(hint['exception'].args)
        for i, arg in enumerate(args):
            if isinstance(arg, str):
                # Remove credit card numbers
                args[i] = re.sub(r'\d{13,19}', '[REDACTED]', arg)
                # Remove email addresses
                args[i] = re.sub(r'[\w\.-]+@[\w\.-]+\.\w+', '[EMAIL]', args[i])
        hint['exception'].args = tuple(args)

    return event


sentry_sdk.init(
    dsn=os.environ.get('SENTRY_DSN'),
    integrations=[
        DjangoIntegration(transaction_style='url'),
        RedisIntegration(),
        CeleryIntegration(monitor_beat_tasks=True),
    ],
    traces_sample_rate=0.1,    # 10% trace sampling
    profiles_sample_rate=0.1,  # 10% profiling
    before_send=before_send,
    environment=os.environ.get('DJANGO_ENV', 'production'),
)
```
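before_send covers global context; anything request-specific is easier to attach where you already know it, using sentry_sdk's standard set_tag and set_context calls. A small sketch - the tenant and billing fields are made-up examples:

```python
import sentry_sdk
from django.http import JsonResponse


def checkout_view(request):
    # Tags are indexed and searchable in the Sentry UI
    sentry_sdk.set_tag("tenant", getattr(request, "tenant_id", "unknown"))

    # Structured context shows up on the event detail page
    sentry_sdk.set_context("billing", {
        "plan": "pro",      # made-up example values
        "cart_items": 3,
    })

    # ... normal view logic; any exception raised from here carries the context
    return JsonResponse({"status": "ok"})
```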
Health Checks That Prevent False Alarms
Health checks need to verify actual application health, not just that the process is running:
```python
# Comprehensive health checks
import os
import time

import psutil
from django.core.cache import cache
from django.db import connection
from django.http import JsonResponse


def health_check(request):
    """Comprehensive health check that verifies all critical systems"""
    health_status = {
        'status': 'healthy',
        'timestamp': time.time(),
        'checks': {}
    }

    # Database connectivity
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT 1")
        health_status['checks']['database'] = 'healthy'
    except Exception as e:
        health_status['status'] = 'unhealthy'
        health_status['checks']['database'] = f'error: {str(e)}'

    # Redis connectivity (via the Django cache backend)
    try:
        cache.set('health_check', 'ok', 10)
        if cache.get('health_check') == 'ok':
            health_status['checks']['redis'] = 'healthy'
        else:
            raise Exception("Cache write/read failed")
    except Exception as e:
        health_status['status'] = 'unhealthy'
        health_status['checks']['redis'] = f'error: {str(e)}'

    # Memory usage check
    process = psutil.Process(os.getpid())
    memory_mb = process.memory_info().rss / 1024 / 1024
    if memory_mb > 2048:  # Over 2GB
        health_status['status'] = 'unhealthy'
        health_status['checks']['memory'] = f'critical: {memory_mb:.1f}MB'
    elif memory_mb > 1024:  # Over 1GB
        health_status['checks']['memory'] = f'warning: {memory_mb:.1f}MB'
    else:
        health_status['checks']['memory'] = f'healthy: {memory_mb:.1f}MB'

    # Disk space check
    disk_usage = psutil.disk_usage('/')
    disk_percent = (disk_usage.used / disk_usage.total) * 100
    if disk_percent > 90:
        health_status['status'] = 'unhealthy'
        health_status['checks']['disk'] = f'critical: {disk_percent:.1f}% full'
    elif disk_percent > 80:
        health_status['checks']['disk'] = f'warning: {disk_percent:.1f}% full'
    else:
        health_status['checks']['disk'] = f'healthy: {disk_percent:.1f}% full'

    # Return 503 when unhealthy so load balancers take the instance out of rotation
    status_code = 200 if health_status['status'] == 'healthy' else 503
    return JsonResponse(health_status, status=status_code)
```
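Expose the view on its own path so a load balancer or orchestrator can poll it. A minimal urls.py entry - the /healthz/ path and module name are just assumptions:

```python
# urls.py
from django.urls import path

from .monitoring import health_check  # wherever the view above actually lives

urlpatterns = [
    path("healthz/", health_check, name="health-check"),
    # ... the rest of your routes
]
```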
Performance Metrics That Actually Matter
Collect metrics that help you optimize and scale, not just vanity metrics:
```python
# Performance metrics collection
import threading
import time

from django.core.cache import cache


class PerformanceMetrics:
    def __init__(self, get_response):
        self.get_response = get_response
        self.metrics_lock = threading.Lock()

    def __call__(self, request):
        start_time = time.time()
        response = self.get_response(request)
        response_time = time.time() - start_time

        # Collect metrics
        with self.metrics_lock:
            self._record_metric('response_time', response_time, request.path)
            self._record_metric('status_code', response.status_code, request.path)
            # CONTENT_LENGTH avoids reading request.body into memory
            self._record_metric('request_size', int(request.META.get('CONTENT_LENGTH') or 0), request.path)
            # Streaming responses have no .content, so skip them
            if not response.streaming:
                self._record_metric('response_size', len(response.content), request.path)

        return response

    def _record_metric(self, metric_name, value, path):
        """Record metric with time-series data"""
        timestamp = int(time.time() / 60) * 60  # Round down to the minute

        # Store the raw data point
        metric_key = f"metrics:{metric_name}:{path}:{timestamp}"
        cache.set(metric_key, value, 3600)  # Keep for 1 hour

        # Update aggregated statistics
        stats_key = f"stats:{metric_name}:{path}"
        stats = cache.get(stats_key, {'count': 0, 'sum': 0, 'min': float('inf'), 'max': 0})
        stats['count'] += 1
        stats['sum'] += value
        stats['min'] = min(stats['min'], value)
        stats['max'] = max(stats['max'], value)
        stats['avg'] = stats['sum'] / stats['count']
        cache.set(stats_key, stats, 3600)
```
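The aggregates are only useful if something reads them back. A hypothetical debug endpoint that pulls the cached per-path stats using the same key scheme as above:

```python
from django.core.cache import cache
from django.http import JsonResponse


def metrics_summary(request):
    """Hypothetical debug endpoint: return cached aggregates for one path."""
    path = request.GET.get("path", "/")
    summary = {}
    for metric_name in ("response_time", "status_code", "request_size", "response_size"):
        stats = cache.get(f"stats:{metric_name}:{path}")
        if stats:
            summary[metric_name] = stats
    return JsonResponse({"path": path, "metrics": summary})
```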
Alerting That Doesn't Cause Alert Fatigue
Good alerts tell you about problems you can fix. Bad alerts wake you up for things you can't control:
```python
# Intelligent alerting system
import logging
import time

from django.core.cache import cache

logger = logging.getLogger(__name__)


class IntelligentAlerting:
    def __init__(self):
        self.alert_thresholds = {
            'error_rate': {'warning': 0.01, 'critical': 0.05},    # 1% and 5%
            'response_time': {'warning': 2.0, 'critical': 5.0},   # 2s and 5s
            'memory_usage': {'warning': 1024, 'critical': 2048},  # 1GB and 2GB (MB)
            'db_connections': {'warning': 80, 'critical': 95},    # Connection pool %
        }

    def check_and_alert(self, metric_name, current_value, context=None):
        """Smart alerting that considers trends and context"""
        thresholds = self.alert_thresholds.get(metric_name)
        if not thresholds:
            return

        # Get historical data for trend analysis
        history_key = f"history:{metric_name}"
        history = cache.get(history_key, [])
        history.append({'value': current_value, 'timestamp': time.time()})

        # Keep only the last hour of data
        cutoff_time = time.time() - 3600
        history = [h for h in history if h['timestamp'] > cutoff_time]
        cache.set(history_key, history, 3600)

        # Calculate trend: recent average vs. older average
        if len(history) >= 5:
            recent_avg = sum(h['value'] for h in history[-5:]) / 5
            older = history[:-5]
            older_avg = sum(h['value'] for h in older) / len(older) if older else recent_avg
            trend = (recent_avg - older_avg) / older_avg if older_avg > 0 else 0
        else:
            trend = 0

        # Determine alert level
        alert_level = None
        if current_value >= thresholds['critical']:
            alert_level = 'critical'
        elif current_value >= thresholds['warning']:
            alert_level = 'warning'

        # Check whether this alert should be suppressed
        if alert_level and not self._should_suppress_alert(metric_name, alert_level, trend):
            self._send_alert(metric_name, current_value, alert_level, trend, context)

    def _should_suppress_alert(self, metric_name, alert_level, trend):
        """Suppress alerts based on recent alert history and trends"""
        recent_alerts_key = f"recent_alerts:{metric_name}:{alert_level}"
        recent_alerts = cache.get(recent_alerts_key, [])

        # Don't repeat the same alert within 15 minutes
        if recent_alerts and time.time() - recent_alerts[-1] < 900:
            return True

        # Don't alert on an improving trend unless it's critical
        if trend < -0.1 and alert_level == 'warning':  # 10% improvement
            return True

        return False

    def _send_alert(self, metric_name, value, level, trend, context):
        """Send alert with context and suggested actions"""
        recent_alerts_key = f"recent_alerts:{metric_name}:{level}"
        recent_alerts = cache.get(recent_alerts_key, [])
        recent_alerts.append(time.time())
        cache.set(recent_alerts_key, recent_alerts, 3600)

        trend_text = "increasing" if trend > 0.1 else "decreasing" if trend < -0.1 else "stable"
        message = f"[{level.upper()}] {metric_name}: {value} (trend: {trend_text})"

        if context:
            message += f" | Context: {context}"

        # Add suggested actions
        suggestions = {
            'memory_usage': "Check for memory leaks, restart affected processes",
            'error_rate': "Check error logs, recent deployments",
            'response_time': "Check database queries, server resources",
            'db_connections': "Check for connection leaks, scale database",
        }
        if metric_name in suggestions:
            message += f" | Suggested action: {suggestions[metric_name]}"

        logger.error(message)
        # Here you would integrate with Slack, PagerDuty, etc.
```
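To make that last comment concrete: a Slack incoming webhook is a few lines with requests. A sketch you'd call from _send_alert, with the webhook URL taken from an environment variable name I've made up - configure your own:

```python
import logging
import os

import requests

logger = logging.getLogger(__name__)


def send_slack_alert(message):
    """Post an alert to a Slack incoming webhook (sketch, not battle-tested)."""
    webhook_url = os.environ.get("SLACK_ALERT_WEBHOOK_URL")  # assumed variable name
    if not webhook_url:
        return  # alerting channel not configured
    try:
        requests.post(webhook_url, json={"text": message}, timeout=5)
    except requests.RequestException:
        logger.exception("Failed to deliver alert to Slack")
```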
This monitoring setup has caught production issues hours before they would have affected users. The key is monitoring what matters and alerting on actionable problems, not just collecting data. For longer-term storage, dashboards, and paging, the same signals can feed Prometheus and Grafana, New Relic, Elastic APM, StatsD, or CloudWatch.
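If you'd rather ship these numbers to one of those systems than keep them in the cache, StatsD is the least code. A sketch with the statsd Python client - the host, port, and metric names are assumptions:

```python
import statsd

# Assumes a StatsD agent (e.g. Telegraf) listening locally; host, port, and prefix are guesses
client = statsd.StatsClient("localhost", 8125, prefix="myapp")


def record_request(path, response_time_s, status_code):
    metric_path = path.strip("/").replace("/", ".") or "root"
    client.timing(f"response_time.{metric_path}", response_time_s * 1000)  # StatsD timings are in ms
    client.incr(f"status.{status_code}")
```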