project initialization

This commit is contained in:
2025-10-05 02:37:33 +08:00
parent 2cbb6d5fa1
commit b3fff546e9
226 changed files with 97805 additions and 35 deletions


@@ -0,0 +1,584 @@
"""
Alert management system for the Malaysian SME SaaS platform.
Provides comprehensive alerting with Malaysian context.
"""
import json
import logging
import smtplib
import requests
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Callable
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from django.conf import settings
from django.core.mail import send_mail
from django.utils import timezone
from django.db import connection
from prometheus_client import Counter, Gauge
import redis
logger = logging.getLogger(__name__)
# Alert metrics
ALERTS_TOTAL = Counter('alerts_total', 'Total alerts generated', ['severity', 'category'])
ALERTS_RESOLVED = Counter('alerts_resolved_total', 'Total alerts resolved', ['severity', 'category'])
ALERTS_ACTIVE = Gauge('alerts_active', 'Currently active alerts', ['severity', 'category'])
class AlertSeverity:
"""Alert severity levels."""
INFO = 'info'
WARNING = 'warning'
ERROR = 'error'
CRITICAL = 'critical'
class AlertCategory:
"""Alert categories."""
SYSTEM = 'system'
APPLICATION = 'application'
DATABASE = 'database'
CACHE = 'cache'
SECURITY = 'security'
BUSINESS = 'business'
MALAYSIAN = 'malaysian'
class Alert:
"""Single alert instance."""
def __init__(
self,
title: str,
description: str,
severity: str,
category: str,
metadata: Optional[Dict[str, Any]] = None
):
self.id = f"{int(timezone.now().timestamp())}-{hash(title)}"
self.title = title
self.description = description
self.severity = severity
self.category = category
self.metadata = metadata or {}
self.created_at = timezone.now()
self.resolved_at = None
self.acknowledged_at = None
self.acknowledged_by = None
self.tags = self.metadata.get('tags', [])
self.source = self.metadata.get('source', 'system')
self.tenant = self.metadata.get('tenant', 'all')
def to_dict(self) -> Dict[str, Any]:
"""Convert alert to dictionary."""
return {
'id': self.id,
'title': self.title,
'description': self.description,
'severity': self.severity,
'category': self.category,
'metadata': self.metadata,
'created_at': self.created_at.isoformat(),
'resolved_at': self.resolved_at.isoformat() if self.resolved_at else None,
'acknowledged_at': self.acknowledged_at.isoformat() if self.acknowledged_at else None,
'acknowledged_by': self.acknowledged_by,
'tags': self.tags,
'source': self.source,
'tenant': self.tenant,
'status': self.get_status(),
}
def get_status(self) -> str:
"""Get alert status."""
if self.resolved_at:
return 'resolved'
elif self.acknowledged_at:
return 'acknowledged'
else:
return 'active'
def acknowledge(self, user: str):
"""Acknowledge alert."""
self.acknowledged_at = timezone.now()
self.acknowledged_by = user
logger.info(f"Alert {self.id} acknowledged by {user}")
def resolve(self, user: Optional[str] = None):
"""Resolve alert."""
self.resolved_at = timezone.now()
if user:
self.acknowledged_by = user
logger.info(f"Alert {self.id} resolved by {user or 'system'}")
# Update metrics
ALERTS_RESOLVED.labels(
severity=self.severity,
category=self.category
).inc()
class AlertRule:
"""Alert rule definition."""
def __init__(
self,
name: str,
condition: Callable[[], bool],
title_template: str,
description_template: str,
severity: str,
category: str,
cooldown_minutes: int = 15,
enabled: bool = True
):
self.name = name
self.condition = condition
self.title_template = title_template
self.description_template = description_template
self.severity = severity
self.category = category
self.cooldown_minutes = cooldown_minutes
self.enabled = enabled
self.last_triggered = None
self.metadata = {}
def should_trigger(self) -> bool:
"""Check if rule should trigger alert."""
if not self.enabled:
return False
# Check cooldown
if self.last_triggered:
cooldown_until = self.last_triggered + timedelta(minutes=self.cooldown_minutes)
if timezone.now() < cooldown_until:
return False
# Check condition
try:
return self.condition()
except Exception as e:
logger.error(f"Error checking alert rule {self.name}: {e}")
return False
def trigger(self, metadata: Optional[Dict[str, Any]] = None) -> Alert:
"""Trigger alert from rule."""
self.last_triggered = timezone.now()
self.metadata = metadata or {}
# Format title and description; fall back to the raw templates when
# metadata lacks a placeholder, so trigger() never raises KeyError
try:
title = self.title_template.format(**self.metadata)
description = self.description_template.format(**self.metadata)
except (KeyError, IndexError):
title = self.title_template
description = self.description_template
return Alert(
title=title,
description=description,
severity=self.severity,
category=self.category,
metadata={**self.metadata, 'rule_name': self.name}
)
class AlertManager:
"""Main alert management system."""
def __init__(self):
self.rules: List[AlertRule] = []
self.active_alerts: Dict[str, Alert] = {}
self.alert_history: List[Alert] = []
self.notifiers = []
self.redis_client = None
self.initialize_redis()
self.setup_default_rules()
self.setup_notifiers()
def initialize_redis(self):
"""Initialize Redis connection for alert persistence."""
try:
self.redis_client = redis.from_url(settings.REDIS_URL)
except Exception as e:
logger.warning(f"Failed to initialize Redis for alerts: {e}")
def setup_default_rules(self):
"""Setup default alert rules."""
# System alerts
self.add_rule(AlertRule(
name='high_cpu_usage',
condition=self._check_high_cpu_usage,
title_template='High CPU Usage Detected',
description_template='CPU usage is {cpu_usage}% on server {server}',
severity=AlertSeverity.WARNING,
category=AlertCategory.SYSTEM,
cooldown_minutes=10
))
self.add_rule(AlertRule(
name='critical_cpu_usage',
condition=self._check_critical_cpu_usage,
title_template='Critical CPU Usage',
description_template='CPU usage is {cpu_usage}% on server {server} - immediate attention required',
severity=AlertSeverity.CRITICAL,
category=AlertCategory.SYSTEM,
cooldown_minutes=5
))
# Database alerts
self.add_rule(AlertRule(
name='database_connection_errors',
condition=self._check_database_connection_errors,
title_template='Database Connection Errors',
description_template='Database connection errors detected: {error_count} errors in the last 5 minutes',
severity=AlertSeverity.ERROR,
category=AlertCategory.DATABASE,
cooldown_minutes=5
))
# Application alerts
self.add_rule(AlertRule(
name='high_error_rate',
condition=self._check_high_error_rate,
title_template='High Application Error Rate',
description_template='Application error rate is {error_rate}% (threshold: 5%)',
severity=AlertSeverity.WARNING,
category=AlertCategory.APPLICATION,
cooldown_minutes=15
))
# Business alerts
self.add_rule(AlertRule(
name='low_active_users',
condition=self._check_low_active_users,
title_template='Low Active Users',
description_template='Only {active_users} active users detected (threshold: {threshold})',
severity=AlertSeverity.INFO,
category=AlertCategory.BUSINESS,
cooldown_minutes=60
))
# Malaysian-specific alerts
self.add_rule(AlertRule(
name='malaysian_service_degradation',
condition=self._check_malaysian_service_degradation,
title_template='Malaysian Service Degradation',
description_template='Malaysian service availability is {availability}% (threshold: 99%)',
severity=AlertSeverity.WARNING,
category=AlertCategory.MALAYSIAN,
cooldown_minutes=10
))
# Security alerts
self.add_rule(AlertRule(
name='suspicious_login_activity',
condition=self._check_suspicious_login_activity,
title_template='Suspicious Login Activity',
description_template='Detected {failed_logins} failed login attempts from IP {ip_address}',
severity=AlertSeverity.WARNING,
category=AlertCategory.SECURITY,
cooldown_minutes=15
))
def setup_notifiers(self):
"""Setup notification channels."""
# Email notifier
if settings.EMAIL_HOST:
self.add_notifier(EmailNotifier())
# Slack notifier
if hasattr(settings, 'SLACK_WEBHOOK_URL'):
self.add_notifier(SlackNotifier())
# SMS notifier for critical alerts (Malaysian numbers)
if hasattr(settings, 'SMS_API_KEY'):
self.add_notifier(SMSNotifier())
def add_rule(self, rule: AlertRule):
"""Add alert rule."""
self.rules.append(rule)
logger.info(f"Added alert rule: {rule.name}")
def add_notifier(self, notifier):
"""Add notification channel."""
self.notifiers.append(notifier)
logger.info(f"Added notifier: {notifier.__class__.__name__}")
def check_rules(self):
"""Check all alert rules and trigger if needed."""
for rule in self.rules:
try:
if rule.should_trigger():
alert = rule.trigger()
self.trigger_alert(alert)
except Exception as e:
logger.error(f"Error checking rule {rule.name}: {e}")
def trigger_alert(self, alert: Alert):
"""Trigger new alert."""
# Check if similar active alert exists
for existing_alert in self.active_alerts.values():
if (existing_alert.title == alert.title and
existing_alert.severity == alert.severity and
existing_alert.get_status() == 'active'):
logger.debug(f"Similar alert already active: {existing_alert.id}")
return
# Add alert
self.active_alerts[alert.id] = alert
self.alert_history.append(alert)
# Update metrics
ALERTS_TOTAL.labels(
severity=alert.severity,
category=alert.category
).inc()
# Keep only recent history
if len(self.alert_history) > 1000:
self.alert_history = self.alert_history[-1000:]
# Store in Redis
if self.redis_client:
try:
self.redis_client.setex(
f"alert:{alert.id}",
86400, # 24 hours
json.dumps(alert.to_dict())
)
except Exception as e:
logger.error(f"Failed to store alert in Redis: {e}")
# Send notifications
self.send_notifications(alert)
logger.warning(f"Alert triggered: {alert.title} ({alert.severity})")
def resolve_alert(self, alert_id: str, user: Optional[str] = None):
"""Resolve alert."""
if alert_id in self.active_alerts:
alert = self.active_alerts[alert_id]
alert.resolve(user)
del self.active_alerts[alert_id]
# Update Redis
if self.redis_client:
try:
self.redis_client.delete(f"alert:{alert_id}")
except Exception as e:
logger.error(f"Failed to delete alert from Redis: {e}")
logger.info(f"Alert resolved: {alert.title}")
def acknowledge_alert(self, alert_id: str, user: str):
"""Acknowledge alert."""
if alert_id in self.active_alerts:
alert = self.active_alerts[alert_id]
alert.acknowledge(user)
logger.info(f"Alert acknowledged: {alert.title} by {user}")
def get_active_alerts(self, severity: Optional[str] = None, category: Optional[str] = None) -> List[Alert]:
"""Get active alerts with optional filtering."""
alerts = list(self.active_alerts.values())
if severity:
alerts = [a for a in alerts if a.severity == severity]
if category:
alerts = [a for a in alerts if a.category == category]
return alerts
def get_alert_history(self, hours: int = 24) -> List[Alert]:
"""Get alert history for specified hours."""
since = timezone.now() - timedelta(hours=hours)
return [a for a in self.alert_history if a.created_at >= since]
def send_notifications(self, alert: Alert):
"""Send alert notifications."""
for notifier in self.notifiers:
try:
if notifier.should_notify(alert):
notifier.send(alert)
except Exception as e:
logger.error(f"Error sending notification via {notifier.__class__.__name__}: {e}")
# Alert condition methods
def _check_high_cpu_usage(self) -> bool:
"""Check for high CPU usage."""
try:
import psutil
cpu_usage = psutil.cpu_percent(interval=1)
return cpu_usage > 80
except Exception:
return False
def _check_critical_cpu_usage(self) -> bool:
"""Check for critical CPU usage."""
try:
import psutil
cpu_usage = psutil.cpu_percent(interval=1)
return cpu_usage > 90
except Exception:
return False
def _check_database_connection_errors(self) -> bool:
"""Check for database connection errors."""
try:
# This would integrate with your error tracking system
# For now, return False as placeholder
return False
except Exception:
return False
def _check_high_error_rate(self) -> bool:
"""Check for high application error rate."""
try:
# This would check application error rates
# For now, return False as placeholder
return False
except Exception:
return False
def _check_low_active_users(self) -> bool:
"""Check for low active users."""
try:
from django.contrib.auth import get_user_model
User = get_user_model()
five_minutes_ago = timezone.now() - timedelta(minutes=5)
active_count = User.objects.filter(
last_login__gte=five_minutes_ago,
is_active=True
).count()
return active_count < 10
except Exception:
return False
def _check_malaysian_service_degradation(self) -> bool:
"""Check for Malaysian service degradation."""
try:
# This would check Malaysian-specific service health
# For now, return False as placeholder
return False
except Exception:
return False
def _check_suspicious_login_activity(self) -> bool:
"""Check for suspicious login activity."""
try:
# This would check for suspicious login patterns
# For now, return False as placeholder
return False
except Exception:
return False
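# A possible concrete implementation of one of the placeholder conditions,
# assuming rolling counters are maintained in Redis (the key names below are
# illustrative, not part of this codebase):
#
#     def _check_high_error_rate(self) -> bool:
#         try:
#             errors = int(self.redis_client.get('metrics:errors:5min') or 0)
#             total = int(self.redis_client.get('metrics:requests:5min') or 0)
#             return total > 0 and (errors / total) * 100 > 5
#         except Exception:
#             return False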
class EmailNotifier:
"""Email notification system."""
def should_notify(self, alert: Alert) -> bool:
"""Check if should send email notification."""
# Send emails for warnings and above
return alert.severity in [AlertSeverity.WARNING, AlertSeverity.ERROR, AlertSeverity.CRITICAL]
def send(self, alert: Alert):
"""Send email notification."""
try:
subject = f"[{alert.severity.upper()}] {alert.title}"
message = f"""
Alert Details:
- Title: {alert.title}
- Severity: {alert.severity}
- Category: {alert.category}
- Description: {alert.description}
- Time: {alert.created_at}
- Source: {alert.source}
- Tenant: {alert.tenant}
Additional Information:
{json.dumps(alert.metadata, indent=2)}
"""
send_mail(
subject,
message,
settings.DEFAULT_FROM_EMAIL,
settings.ALERT_EMAIL_RECIPIENTS,
fail_silently=False
)
logger.info(f"Email notification sent for alert: {alert.id}")
except Exception as e:
logger.error(f"Failed to send email notification: {e}")
class SlackNotifier:
"""Slack notification system."""
def should_notify(self, alert: Alert) -> bool:
"""Check if should send Slack notification."""
# Send Slack for all alerts
return True
def send(self, alert: Alert):
"""Send Slack notification."""
try:
webhook_url = settings.SLACK_WEBHOOK_URL
# Color based on severity
colors = {
AlertSeverity.INFO: '#36a64f',
AlertSeverity.WARNING: '#ff9500',
AlertSeverity.ERROR: '#ff0000',
AlertSeverity.CRITICAL: '#990000'
}
payload = {
'text': f'{alert.severity.upper()}: {alert.title}',
'attachments': [{
'color': colors.get(alert.severity, '#36a64f'),
'title': alert.title,
'text': alert.description,
'fields': [
{'title': 'Severity', 'value': alert.severity, 'short': True},
{'title': 'Category', 'value': alert.category, 'short': True},
{'title': 'Time', 'value': alert.created_at.strftime('%Y-%m-%d %H:%M:%S'), 'short': True},
{'title': 'Tenant', 'value': alert.tenant, 'short': True},
],
'footer': 'Malaysian SME Platform Alert System',
'ts': int(alert.created_at.timestamp())
}]
}
response = requests.post(webhook_url, json=payload, timeout=10)
response.raise_for_status()
logger.info(f"Slack notification sent for alert: {alert.id}")
except Exception as e:
logger.error(f"Failed to send Slack notification: {e}")
class SMSNotifier:
"""SMS notification system for critical alerts."""
def should_notify(self, alert: Alert) -> bool:
"""Check if should send SMS notification."""
# Only send SMS for critical alerts
return alert.severity == AlertSeverity.CRITICAL
def send(self, alert: Alert):
"""Send SMS notification."""
try:
# This would integrate with Malaysian SMS service
# For now, just log the attempt
logger.info(f"SMS notification would be sent for critical alert: {alert.id}")
# Example integration with Malaysian SMS service
# sms_api_url = settings.SMS_API_URL
# api_key = settings.SMS_API_KEY
# recipients = settings.CRITICAL_ALERT_SMS_RECIPIENTS
# message = f"CRITICAL: {alert.title}. {alert.description[:100]}"
# payload = {
# 'api_key': api_key,
# 'recipients': recipients,
# 'message': message
# }
# response = requests.post(sms_api_url, json=payload, timeout=10)
# response.raise_for_status()
except Exception as e:
logger.error(f"Failed to send SMS notification: {e}")
# Global alert manager instance
alert_manager = AlertManager()
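A minimal usage sketch for the alert manager above. The module path monitoring.alerts and the queue-depth rule are assumptions for illustration; check_rules() is normally driven on a timer by the management command later in this commit.

from monitoring.alerts import alert_manager, AlertRule, AlertSeverity, AlertCategory

def task_queue_depth() -> int:
    """Hypothetical probe; replace with a real queue inspection."""
    return 0

# Register a custom rule alongside the defaults
alert_manager.add_rule(AlertRule(
    name='large_task_queue',
    condition=lambda: task_queue_depth() > 1000,
    title_template='Task queue backlog',
    description_template='Task queue depth exceeded 1000 items',
    severity=AlertSeverity.WARNING,
    category=AlertCategory.APPLICATION,
    cooldown_minutes=10,
))

alert_manager.check_rules()  # evaluate every rule once

# Operator workflow: acknowledge, then resolve
for alert in alert_manager.get_active_alerts(severity=AlertSeverity.WARNING):
    alert_manager.acknowledge_alert(alert.id, user='ops@example.com')
    alert_manager.resolve_alert(alert.id, user='ops@example.com')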


@@ -0,0 +1,709 @@
"""
Prometheus exporters for various system and application metrics.
"""
import time
import logging
import threading
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from django.db import connection, connections
from django.core.cache import cache
from django.conf import settings
from django.contrib.auth import get_user_model
from django.db.models import Count, Q, Avg, Sum
from django.utils import timezone
from django_tenants.utils import get_tenant_model
from prometheus_client import Gauge, Counter, Histogram, Info, start_http_server
from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, REGISTRY
import psutil
import redis
from .middleware import (
DATABASE_QUERIES, CACHE_HITS, CACHE_MISSES, MALAYSIAN_OPERATIONS,
TENANT_METRICS, BUSINESS_METRICS, ERROR_EVENTS
)
logger = logging.getLogger(__name__)
User = get_user_model()
TenantModel = get_tenant_model()
class DatabaseExporter:
"""Exporter for database metrics."""
def __init__(self):
self.metrics = {
'database_size': Gauge(
'database_size_bytes',
'Database size in bytes',
['database', 'tenant']
),
'database_connections': Gauge(
'database_connections_current',
'Current database connections',
['state', 'tenant']
),
'database_transactions': Counter(
'database_transactions_total',
'Database transactions',
['type', 'tenant']
),
'database_query_time': Histogram(
'database_query_duration_seconds',
'Database query duration',
['query_type', 'tenant']
),
'database_deadlocks': Counter(
'database_deadlocks_total',
'Database deadlocks',
['tenant']
),
'database_cache_hit_ratio': Gauge(
'database_cache_hit_ratio',
'Database cache hit ratio',
['tenant']
),
}
def collect_metrics(self):
"""Collect database metrics."""
try:
self._collect_database_size()
self._collect_connection_metrics()
self._collect_transaction_metrics()
self._collect_performance_metrics()
self._collect_deadlock_metrics()
except Exception as e:
logger.error(f"Error collecting database metrics: {e}")
def _collect_database_size(self):
"""Collect database size metrics."""
try:
with connection.cursor() as cursor:
cursor.execute("""
SELECT datname, pg_database_size(datname) as size
FROM pg_database
WHERE datistemplate = false
""")
for row in cursor.fetchall():
db_name, size = row
self.metrics['database_size'].labels(
database=db_name,
tenant='all'
).set(size)
except Exception as e:
logger.error(f"Error collecting database size: {e}")
def _collect_connection_metrics(self):
"""Collect connection metrics."""
try:
with connection.cursor() as cursor:
# Current connections
cursor.execute("""
SELECT state, COUNT(*)
FROM pg_stat_activity
WHERE pid <> pg_backend_pid()
GROUP BY state
""")
for state, count in cursor.fetchall():
self.metrics['database_connections'].labels(
state=state or 'idle',
tenant='all'
).set(count)
# Max connections
cursor.execute("SHOW max_connections")
max_connections = cursor.fetchone()[0]
self.metrics['database_connections'].labels(
state='max',
tenant='all'
).set(max_connections)
except Exception as e:
logger.error(f"Error collecting connection metrics: {e}")
def _collect_transaction_metrics(self):
"""Collect transaction metrics."""
try:
with connection.cursor() as cursor:
cursor.execute("""
SELECT datname, xact_commit, xact_rollback
FROM pg_stat_database
""")
for db_name, commits, rollbacks in cursor.fetchall():
# These counters mirror PostgreSQL's own cumulative counters, so the
# internal value is set directly rather than incremented
self.metrics['database_transactions'].labels(
type='commit',
tenant=db_name
)._value.set(commits)
self.metrics['database_transactions'].labels(
type='rollback',
tenant=db_name
)._value.set(rollbacks)
except Exception as e:
logger.error(f"Error collecting transaction metrics: {e}")
def _collect_performance_metrics(self):
"""Collect performance metrics."""
try:
with connection.cursor() as cursor:
# Query performance (requires the pg_stat_statements extension; on
# PostgreSQL 13+ the columns are total_exec_time/mean_exec_time)
cursor.execute("""
SELECT query, calls, total_time, mean_time, rows
FROM pg_stat_statements
ORDER BY total_time DESC
LIMIT 100
""")
for query, calls, total_time, mean_time, rows in cursor.fetchall():
query_type = self._classify_query(query)
self.metrics['database_query_time'].labels(
query_type=query_type,
tenant='all'
).observe(mean_time / 1000) # Convert to seconds
# Cache hit ratio
cursor.execute("""
SELECT sum(blks_hit) / (sum(blks_hit) + sum(blks_read)) as hit_ratio
FROM pg_stat_database
""")
hit_ratio = cursor.fetchone()[0]
if hit_ratio:
self.metrics['database_cache_hit_ratio'].labels(
tenant='all'
).set(hit_ratio * 100)
except Exception as e:
logger.error(f"Error collecting performance metrics: {e}")
def _collect_deadlock_metrics(self):
"""Collect deadlock metrics."""
try:
with connection.cursor() as cursor:
cursor.execute("""
SELECT datname, deadlocks
FROM pg_stat_database
""")
for db_name, deadlocks in cursor.fetchall():
if deadlocks > 0:
self.metrics['database_deadlocks'].labels(
tenant=db_name
)._value.set(deadlocks)
except Exception as e:
logger.error(f"Error collecting deadlock metrics: {e}")
def _classify_query(self, query: str) -> str:
"""Classify SQL query type."""
query_upper = query.upper()
if query_upper.startswith('SELECT'):
return 'select'
elif query_upper.startswith('INSERT'):
return 'insert'
elif query_upper.startswith('UPDATE'):
return 'update'
elif query_upper.startswith('DELETE'):
return 'delete'
elif query_upper.startswith('CREATE'):
return 'ddl'
elif query_upper.startswith('ALTER'):
return 'ddl'
elif query_upper.startswith('DROP'):
return 'ddl'
else:
return 'other'
class CacheExporter:
"""Exporter for cache metrics."""
def __init__(self):
self.metrics = {
'cache_size': Gauge(
'cache_size_bytes',
'Cache size in bytes',
['cache_type', 'tenant']
),
'cache_items': Gauge(
'cache_items_total',
'Total items in cache',
['cache_type', 'tenant']
),
# Distinct metric name: middleware.py already registers
# cache_operations_total on the default registry
'cache_operations': Counter(
'cache_backend_operations_total',
'Cache operations observed from cache backend statistics',
['operation', 'cache_type', 'tenant']
),
'cache_hit_ratio': Gauge(
'cache_hit_ratio_percent',
'Cache hit ratio percentage',
['cache_type', 'tenant']
),
'cache_evictions': Counter(
'cache_evictions_total',
'Cache evictions',
['cache_type', 'tenant']
),
'cache_memory_usage': Gauge(
'cache_memory_usage_bytes',
'Cache memory usage',
['cache_type', 'tenant']
),
}
def collect_metrics(self):
"""Collect cache metrics."""
try:
self._collect_redis_metrics()
self._collect_django_cache_metrics()
except Exception as e:
logger.error(f"Error collecting cache metrics: {e}")
def _collect_redis_metrics(self):
"""Collect Redis metrics."""
try:
redis_client = redis.Redis.from_url(settings.REDIS_URL)
info = redis_client.info()
# Memory usage
self.metrics['cache_memory_usage'].labels(
cache_type='redis',
tenant='all'
).set(info['used_memory'])
# Key count
self.metrics['cache_items'].labels(
cache_type='redis',
tenant='all'
).set(redis_client.dbsize())
# Hit ratio
hits = info.get('keyspace_hits', 0)
misses = info.get('keyspace_misses', 0)
total = hits + misses
if total > 0:
hit_ratio = (hits / total) * 100
self.metrics['cache_hit_ratio'].labels(
cache_type='redis',
tenant='all'
).set(hit_ratio)
# Operations (cumulative, mirroring Redis' own counters)
self.metrics['cache_operations'].labels(
operation='get',
cache_type='redis',
tenant='all'
)._value.set(total)
except Exception as e:
logger.error(f"Error collecting Redis metrics: {e}")
def _collect_django_cache_metrics(self):
"""Collect Django cache metrics."""
try:
# get_stats() exists only on some backends (e.g. memcached), so guard
if not hasattr(cache, 'get_stats'):
return
cache_stats = cache.get_stats()
for backend_name, stats in cache_stats.items():
if 'hits' in stats and 'misses' in stats:
total = stats['hits'] + stats['misses']
if total > 0:
hit_ratio = (stats['hits'] / total) * 100
self.metrics['cache_hit_ratio'].labels(
cache_type='django',
tenant='all'
).set(hit_ratio)
self.metrics['cache_operations'].labels(
operation='get',
cache_type='django',
tenant='all'
)._value.set(total)
except Exception as e:
logger.error(f"Error collecting Django cache metrics: {e}")
class SystemExporter:
"""Exporter for system metrics."""
def __init__(self):
self.metrics = {
'system_cpu_usage': Gauge(
'system_cpu_usage_percent',
'System CPU usage percentage'
),
'system_memory_usage': Gauge(
'system_memory_usage_bytes',
'System memory usage'
),
'system_memory_usage_percent': Gauge(
'system_memory_usage_percent',
'System memory usage percentage'
),
'system_disk_usage': Gauge(
'system_disk_usage_bytes',
'System disk usage',
['device', 'mountpoint']
),
'system_disk_usage_percent': Gauge(
'system_disk_usage_percent',
'System disk usage percentage',
['device', 'mountpoint']
),
'system_network_bytes': Counter(
'system_network_bytes_total',
'System network traffic',
['direction', 'interface']
),
'system_load_average': Gauge(
'system_load_average',
'System load average',
['period']
),
'system_uptime': Gauge(
'system_uptime_seconds',
'System uptime in seconds'
),
}
def collect_metrics(self):
"""Collect system metrics."""
try:
self._collect_cpu_metrics()
self._collect_memory_metrics()
self._collect_disk_metrics()
self._collect_network_metrics()
self._collect_load_metrics()
except Exception as e:
logger.error(f"Error collecting system metrics: {e}")
def _collect_cpu_metrics(self):
"""Collect CPU metrics."""
try:
cpu_percent = psutil.cpu_percent(interval=1)
self.metrics['system_cpu_usage'].set(cpu_percent)
# Per-CPU usage would require a Gauge declared with a 'cpu' label;
# the unlabeled gauge above tracks the aggregate only
except Exception as e:
logger.error(f"Error collecting CPU metrics: {e}")
def _collect_memory_metrics(self):
"""Collect memory metrics."""
try:
memory = psutil.virtual_memory()
self.metrics['system_memory_usage'].set(memory.used)
self.metrics['system_memory_usage_percent'].set(memory.percent)
# Swap metrics would likewise need gauges declared with a 'type'
# label; the unlabeled gauges above track main memory only
except Exception as e:
logger.error(f"Error collecting memory metrics: {e}")
def _collect_disk_metrics(self):
"""Collect disk metrics."""
try:
disk_usage = psutil.disk_usage('/')
self.metrics['system_disk_usage'].labels(
device='root',
mountpoint='/'
).set(disk_usage.used)
self.metrics['system_disk_usage_percent'].labels(
device='root',
mountpoint='/'
).set((disk_usage.used / disk_usage.total) * 100)
# Disk I/O (recorded under the network counter with a 'disk'
# pseudo-interface; cumulative values mirror the kernel counters)
disk_io = psutil.disk_io_counters()
if disk_io:
self.metrics['system_network_bytes'].labels(
direction='read',
interface='disk'
)._value.set(disk_io.read_bytes)
self.metrics['system_network_bytes'].labels(
direction='write',
interface='disk'
)._value.set(disk_io.write_bytes)
except Exception as e:
logger.error(f"Error collecting disk metrics: {e}")
def _collect_network_metrics(self):
"""Collect network metrics."""
try:
net_io = psutil.net_io_counters()
if net_io:
self.metrics['system_network_bytes'].labels(
direction='recv',
interface='all'
)._value.set(net_io.bytes_recv)
self.metrics['system_network_bytes'].labels(
direction='sent',
interface='all'
)._value.set(net_io.bytes_sent)
except Exception as e:
logger.error(f"Error collecting network metrics: {e}")
def _collect_load_metrics(self):
"""Collect load average metrics."""
try:
load_avg = psutil.getloadavg()
self.metrics['system_load_average'].labels(period='1min').set(load_avg[0])
self.metrics['system_load_average'].labels(period='5min').set(load_avg[1])
self.metrics['system_load_average'].labels(period='15min').set(load_avg[2])
# System uptime
self.metrics['system_uptime'].set(time.time() - psutil.boot_time())
except Exception as e:
logger.error(f"Error collecting load metrics: {e}")
class BusinessExporter:
"""Exporter for business metrics."""
def __init__(self):
self.metrics = {
'active_users': Gauge(
'business_active_users',
'Number of active users',
['tenant', 'industry_type']
),
'user_registrations': Counter(
'business_user_registrations_total',
'User registrations',
['tenant', 'period']
),
'revenue': Counter(
'business_revenue_myr_total',
'Revenue in Malaysian Ringgit',
['tenant', 'industry_type']
),
'transactions': Counter(
'business_transactions_total',
'Business transactions',
['status', 'tenant', 'payment_method']
),
'tenant_resource_usage': Gauge(
'business_tenant_resource_usage_percent',
'Tenant resource usage percentage',
['tenant', 'resource_type']
),
'malaysian_specific': Counter(
'business_malaysian_operations_total',
'Malaysian-specific operations',
['operation', 'state', 'tenant']
),
}
def collect_metrics(self):
"""Collect business metrics."""
try:
self._collect_user_metrics()
self._collect_revenue_metrics()
self._collect_transaction_metrics()
self._collect_tenant_metrics()
self._collect_malaysian_metrics()
except Exception as e:
logger.error(f"Error collecting business metrics: {e}")
def _collect_user_metrics(self):
"""Collect user metrics."""
try:
# Active users (last 5 minutes)
five_minutes_ago = timezone.now() - timedelta(minutes=5)
active_count = User.objects.filter(
last_login__gte=five_minutes_ago,
is_active=True
).count()
self.metrics['active_users'].labels(
tenant='all',
industry_type='all'
).set(active_count)
# User registrations by period
today = timezone.now().date()
week_ago = today - timedelta(days=7)
month_ago = today - timedelta(days=30)
registrations_today = User.objects.filter(
date_joined__date=today
).count()
registrations_week = User.objects.filter(
date_joined__date__gte=week_ago
).count()
registrations_month = User.objects.filter(
date_joined__date__gte=month_ago
).count()
self.metrics['user_registrations'].labels(
tenant='all',
period='today'
)._value.set(registrations_today)
self.metrics['user_registrations'].labels(
tenant='all',
period='week'
)._value.set(registrations_week)
self.metrics['user_registrations'].labels(
tenant='all',
period='month'
)._value.set(registrations_month)
except Exception as e:
logger.error(f"Error collecting user metrics: {e}")
def _collect_revenue_metrics(self):
"""Collect revenue metrics."""
try:
# This would integrate with your payment system
# For now, we'll use placeholder values
from core.models import Transaction
today = timezone.now().date()
today_revenue = Transaction.objects.filter(
created_at__date=today,
status='completed'
).aggregate(total=Sum('amount'))['total'] or 0
self.metrics['revenue'].labels(
tenant='all',
industry_type='all'
)._value.set(today_revenue)
except Exception as e:
logger.error(f"Error collecting revenue metrics: {e}")
def _collect_transaction_metrics(self):
"""Collect transaction metrics."""
try:
from core.models import Transaction
# Transaction counts by status
status_counts = Transaction.objects.values('status').annotate(
count=Count('id')
)
for item in status_counts:
self.metrics['transactions'].labels(
status=item['status'],
tenant='all',
payment_method='all'
)._value.set(item['count'])
except Exception as e:
logger.error(f"Error collecting transaction metrics: {e}")
def _collect_tenant_metrics(self):
"""Collect tenant metrics."""
try:
tenants = TenantModel.objects.all()
for tenant in tenants:
# Tenant resource usage (placeholder)
self.metrics['tenant_resource_usage'].labels(
tenant=tenant.name,
resource_type='storage'
).set(50) # Placeholder value
# Tenant active users
active_users = User.objects.filter(
tenant=tenant,
is_active=True,
last_login__gte=timezone.now() - timedelta(minutes=30)
).count()
self.metrics['active_users'].labels(
tenant=tenant.name,
industry_type=getattr(tenant, 'industry_type', 'general')
).set(active_users)
except Exception as e:
logger.error(f"Error collecting tenant metrics: {e}")
def _collect_malaysian_metrics(self):
"""Collect Malaysian-specific metrics."""
try:
from core.models import MalaysianICValidation, SSTCalculation
# IC validations by state
ic_validations = MalaysianICValidation.objects.values(
'state'
).annotate(count=Count('id'))
for item in ic_validations:
self.metrics['malaysian_specific'].labels(
operation='ic_validation',
state=item['state'],
tenant='all'
)._value.set(item['count'])
# SST calculations
sst_calculations = SSTCalculation.objects.count()
self.metrics['malaysian_specific'].labels(
operation='sst_calculation',
state='all',
tenant='all'
)._value.set(sst_calculations)
except Exception as e:
logger.error(f"Error collecting Malaysian metrics: {e}")
class MetricsCollector:
"""Main metrics collector that runs all exporters."""
def __init__(self):
self.exporters = {
'database': DatabaseExporter(),
'cache': CacheExporter(),
'system': SystemExporter(),
'business': BusinessExporter(),
}
self.running = False
self.thread = None
def start_collection(self, interval: int = 30):
"""Start metrics collection in background thread."""
if not self.running:
self.running = True
self.thread = threading.Thread(target=self._collect_loop, args=(interval,))
self.thread.daemon = True
self.thread.start()
logger.info("Metrics collection started")
def stop_collection(self):
"""Stop metrics collection."""
self.running = False
if self.thread:
self.thread.join()
logger.info("Metrics collection stopped")
def _collect_loop(self, interval: int):
"""Main collection loop."""
while self.running:
try:
for name, exporter in self.exporters.items():
logger.debug(f"Collecting {name} metrics...")
exporter.collect_metrics()
time.sleep(interval)
except Exception as e:
logger.error(f"Error in metrics collection loop: {e}")
time.sleep(interval)
def collect_once(self):
"""Collect metrics once (for testing)."""
for name, exporter in self.exporters.items():
try:
logger.debug(f"Collecting {name} metrics...")
exporter.collect_metrics()
except Exception as e:
logger.error(f"Error collecting {name} metrics: {e}")
# Global metrics collector instance
metrics_collector = MetricsCollector()
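A sketch of running the collectors outside Django's request cycle and exposing the default registry with prometheus_client's start_http_server. The module path monitoring.exporters and port 8001 are assumptions; the MetricsView later in this commit is the in-process alternative.

from prometheus_client import start_http_server
from monitoring.exporters import metrics_collector

start_http_server(8001)  # serves /metrics from the default REGISTRY
metrics_collector.start_collection(interval=30)

# In tests, a single synchronous pass avoids the background thread:
metrics_collector.collect_once()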


@@ -0,0 +1,132 @@
"""
Django management command to start metrics collection.
"""
import time
import signal
import sys
from django.core.management.base import BaseCommand
from django.conf import settings
# Three-dot relative imports: this file lives two packages below the app,
# under <app>/management/commands/
from ...exporters import metrics_collector
from ...alerts import alert_manager
class Command(BaseCommand):
help = 'Start metrics collection and alert monitoring'
def add_arguments(self, parser):
parser.add_argument(
'--interval',
type=int,
default=30,
help='Metrics collection interval in seconds (default: 30)'
)
parser.add_argument(
'--alert-interval',
type=int,
default=60,
help='Alert checking interval in seconds (default: 60)'
)
parser.add_argument(
'--port',
type=int,
default=8001,
help='Metrics server port (default: 8001)'
)
parser.add_argument(
'--metrics-only',
action='store_true',
help='Only collect metrics, no alerts'
)
parser.add_argument(
'--alerts-only',
action='store_true',
help='Only check alerts, no metrics collection'
)
parser.add_argument(
'--quiet',
action='store_true',
help='Run quietly'
)
def handle(self, *args, **options):
self.interval = options['interval']
self.alert_interval = options['alert_interval']
self.port = options['port']
self.metrics_only = options['metrics_only']
self.alerts_only = options['alerts_only']
self.quiet = options['quiet']
# Set up signal handlers for graceful shutdown
signal.signal(signal.SIGINT, self.signal_handler)
signal.signal(signal.SIGTERM, self.signal_handler)
self.running = True
if not self.quiet:
self.stdout.write(
self.style.SUCCESS('Starting metrics collection and alert monitoring...')
)
self.stdout.write(f'Metrics interval: {self.interval} seconds')
self.stdout.write(f'Alert interval: {self.alert_interval} seconds')
try:
# Start metrics collection
if not self.alerts_only:
if not self.quiet:
self.stdout.write('Starting metrics collection...')
metrics_collector.start_collection(self.interval)
# Start alert monitoring
if not self.metrics_only:
if not self.quiet:
self.stdout.write('Starting alert monitoring...')
self.start_alert_monitoring()
# Keep the command running
if not self.quiet:
self.stdout.write('Monitoring started. Press Ctrl+C to stop.')
while self.running:
time.sleep(1)
except KeyboardInterrupt:
if not self.quiet:
self.stdout.write('\nReceived interrupt signal, stopping...')
finally:
self.shutdown()
def start_alert_monitoring(self):
"""Start alert monitoring in a separate thread."""
import threading
def alert_monitor():
while self.running:
try:
alert_manager.check_rules()
time.sleep(self.alert_interval)
except Exception as e:
if not self.quiet:
self.stdout.write(
self.style.ERROR(f'Error in alert monitoring: {e}')
)
time.sleep(self.alert_interval)
alert_thread = threading.Thread(target=alert_monitor, daemon=True)
alert_thread.start()
def signal_handler(self, signum, frame):
"""Handle shutdown signals."""
if not self.quiet:
self.stdout.write(f'\nReceived signal {signum}, shutting down...')
self.running = False
def shutdown(self):
"""Shutdown the monitoring system."""
if not self.quiet:
self.stdout.write('Shutting down metrics collection...')
metrics_collector.stop_collection()
if not self.quiet:
self.stdout.write(self.style.SUCCESS('Monitoring stopped.'))
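The command's name comes from its filename, which this diff does not show; 'run_monitoring' is assumed below for illustration. Note that the --port option is parsed but never used by handle() as written.

from django.core.management import call_command

# Equivalent shell form:
#   python manage.py run_monitoring --interval 15 --alert-interval 30 --quiet
call_command('run_monitoring', interval=15, alert_interval=30, quiet=True)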


@@ -0,0 +1,512 @@
"""
Django middleware for application monitoring and metrics collection.
"""
import time
import logging
import uuid
import platform
import django
from datetime import datetime
from typing import Dict, Any, Optional
from django.conf import settings
from django.http import HttpRequest, HttpResponse
from django.contrib.auth import get_user_model
from django.db import connection
from django.utils import timezone
from django_tenants.utils import get_tenant_model
from prometheus_client import Counter, Histogram, Gauge, Info, CollectorRegistry, generate_latest, REGISTRY
logger = logging.getLogger(__name__)
User = get_user_model()
TenantModel = get_tenant_model()
# Prometheus metrics
REQUEST_COUNT = Counter(
'http_requests_total',
'Total HTTP requests',
['method', 'endpoint', 'status', 'tenant']
)
REQUEST_DURATION = Histogram(
'http_request_duration_seconds',
'HTTP request duration',
['method', 'endpoint', 'tenant'],
buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0]
)
ACTIVE_USERS = Gauge(
'active_users_total',
'Number of active users'
)
DATABASE_CONNECTIONS = Gauge(
'database_connections_total',
'Database connections',
['state']
)
DATABASE_QUERIES = Counter(
'database_queries_total',
'Database queries executed',
['type', 'table', 'tenant']
)
CACHE_OPERATIONS = Counter(
'cache_operations_total',
'Cache operations',
['operation', 'cache_type', 'tenant']
)
CACHE_HITS = Counter(
'cache_hits_total',
'Cache hits',
['cache_type', 'tenant']
)
CACHE_MISSES = Counter(
'cache_misses_total',
'Cache misses',
['cache_type', 'tenant']
)
MALAYSIAN_OPERATIONS = Counter(
'malaysian_operations_total',
'Malaysian-specific operations',
['operation', 'type']
)
TENANT_METRICS = Gauge(
'tenant_metrics',
'Tenant-specific metrics',
['tenant_id', 'metric_type', 'tenant_name', 'industry_type']
)
AUTH_EVENTS = Counter(
'auth_events_total',
'Authentication events',
['event_type', 'result', 'tenant']
)
ERROR_EVENTS = Counter(
'error_events_total',
'Application errors',
['error_type', 'severity', 'tenant']
)
BUSINESS_METRICS = Counter(
'business_events_total',
'Business events',
['event_type', 'tenant']
)
SLO_METRICS = Histogram(
'slo_metrics',
'Service Level Objective metrics',
['slo_name', 'tenant']
)
# Application info
APP_INFO = Info('application_info', 'Application information')
APP_INFO.info({
'version': getattr(settings, 'VERSION', '1.0.0'),
'environment': getattr(settings, 'ENVIRONMENT', 'development'),
'django_version': django.get_version(),
'python_version': platform.python_version(),
'malaysian_sme_platform': 'true'
})
class MonitoringMiddleware:
"""Middleware for comprehensive application monitoring."""
def __init__(self, get_response):
self.get_response = get_response
self.registry = CollectorRegistry()
def __call__(self, request: HttpRequest) -> HttpResponse:
# Generate request ID for tracing
request_id = str(uuid.uuid4())
request.request_id = request_id
# Start timing
start_time = time.time()
# Get tenant info
tenant_info = self._get_tenant_info(request)
# Log request start
self._log_request_start(request, tenant_info)
# Execute request
response = self.get_response(request)
# Calculate metrics
duration = time.time() - start_time
endpoint = self._get_endpoint(request)
status_code = str(response.status_code)
# Record metrics
self._record_request_metrics(request, response, duration, endpoint, tenant_info)
self._record_business_metrics(request, response, tenant_info)
self._record_slo_metrics(request, response, duration, tenant_info)
# Add monitoring headers
self._add_monitoring_headers(response, request_id, duration)
return response
def process_exception(self, request: HttpRequest, exception: Exception) -> Optional[HttpResponse]:
"""Process exceptions and record error metrics."""
tenant_info = self._get_tenant_info(request)
ERROR_EVENTS.labels(
error_type=exception.__class__.__name__,
severity='error',
tenant=tenant_info.get('name', 'unknown')
).inc()
logger.error(f"Request {request.request_id} failed: {exception}", exc_info=True)
return None
def process_template_response(self, request: HttpRequest, response: HttpResponse) -> HttpResponse:
"""Process template responses for additional metrics."""
# Add performance metrics to template context
if hasattr(request, 'tenant'):
response.context_data = response.context_data or {}
response.context_data.update({
'performance_metrics': self._get_performance_metrics(request),
'tenant_metrics': self._get_tenant_metrics(request),
})
return response
def _get_tenant_info(self, request: HttpRequest) -> Dict[str, Any]:
"""Get tenant information from request."""
tenant_info = {'id': None, 'name': 'public', 'schema': 'public'}
if hasattr(request, 'tenant') and request.tenant:
tenant_info = {
'id': request.tenant.id,
'name': request.tenant.name,
'schema': request.tenant.schema_name,
'domain': getattr(request.tenant, 'domain_url', ''),
'industry_type': getattr(request.tenant, 'industry_type', 'general')
}
return tenant_info
def _log_request_start(self, request: HttpRequest, tenant_info: Dict[str, Any]):
"""Log request start information."""
logger.info(
f"Request started: {request.method} {request.path} "
f"(ID: {request.request_id}, Tenant: {tenant_info['name']}, "
f"User: {request.user if request.user.is_authenticated else 'anonymous'})"
)
def _get_endpoint(self, request: HttpRequest) -> str:
"""Extract endpoint name from request."""
try:
if hasattr(request.resolver_match, 'view_name'):
return request.resolver_match.view_name or request.path
return request.path
except AttributeError:
return request.path
def _record_request_metrics(self, request: HttpRequest, response: HttpResponse,
duration: float, endpoint: str, tenant_info: Dict[str, Any]):
"""Record HTTP request metrics."""
tenant_name = tenant_info.get('name', 'unknown')
# Record request count and duration
REQUEST_COUNT.labels(
method=request.method,
endpoint=endpoint,
status=status_code,
tenant=tenant_name
).inc()
REQUEST_DURATION.labels(
method=request.method,
endpoint=endpoint,
tenant=tenant_name
).observe(duration)
# Record database queries if available
if hasattr(connection, 'queries') and connection.queries:
for query in connection.queries:
DATABASE_QUERIES.labels(
type='select' if query['sql'].upper().startswith('SELECT') else 'other',
table=self._extract_table_name(query['sql']),
tenant=tenant_name
).inc()
# Log slow requests
if duration > 2.0: # 2 seconds threshold
logger.warning(
f"Slow request detected: {request.method} {request.path} "
f"(Duration: {duration:.2f}s, Tenant: {tenant_name})"
)
def _record_business_metrics(self, request: HttpRequest, response: HttpResponse,
tenant_info: Dict[str, Any]):
"""Record business-specific metrics."""
tenant_name = tenant_info.get('name', 'unknown')
# Track user activity
if request.user.is_authenticated:
BUSINESS_METRICS.labels(
event_type='user_activity',
tenant=tenant_name
).inc()
# Track Malaysian-specific operations
if self._is_malaysian_endpoint(request.path):
BUSINESS_METRICS.labels(
event_type='malaysian_operation',
tenant=tenant_name
).inc()
# Track API calls
if request.path.startswith('/api/'):
BUSINESS_METRICS.labels(
event_type='api_call',
tenant=tenant_name
).inc()
def _record_slo_metrics(self, request: HttpRequest, response: HttpResponse,
duration: float, tenant_info: Dict[str, Any]):
"""Record Service Level Objective metrics."""
tenant_name = tenant_info.get('name', 'unknown')
# Availability SLO
SLO_METRICS.labels(
slo_name='availability',
tenant=tenant_name
).observe(1.0 if response.status_code < 500 else 0.0)
# Latency SLO
latency_slo_value = 1.0 if duration <= 2.0 else max(0.0, 1.0 - (duration - 2.0) / 8.0)
SLO_METRICS.labels(
slo_name='latency',
tenant=tenant_name
).observe(latency_slo_value)
def _add_monitoring_headers(self, response: HttpResponse, request_id: str, duration: float):
"""Add monitoring headers to response."""
response['X-Request-ID'] = request_id
response['X-Response-Time'] = f"{duration:.3f}s"
response['X-Monitoring-Timestamp'] = timezone.now().isoformat()
def _extract_table_name(self, sql: str) -> str:
"""Extract table name from SQL query."""
try:
sql_upper = sql.upper()
if 'FROM ' in sql_upper:
from_part = sql_upper.split('FROM ')[1]
table_name = from_part.split()[0].strip('"[]')
return table_name
except Exception:
pass
return 'unknown'
def _is_malaysian_endpoint(self, path: str) -> bool:
"""Check if endpoint is Malaysian-specific."""
malaysian_endpoints = [
'/api/malaysian/',
'/api/sst/',
'/api/ic-validation/',
'/api/postcode/',
]
return any(path.startswith(endpoint) for endpoint in malaysian_endpoints)
def _get_performance_metrics(self, request: HttpRequest) -> Dict[str, Any]:
"""Get performance metrics for template context."""
return {
'response_time': getattr(request, 'response_time', 0),
'database_queries': len(getattr(connection, 'queries', [])),
'cache_hits': getattr(request, 'cache_hits', 0),
'cache_misses': getattr(request, 'cache_misses', 0),
}
def _get_tenant_metrics(self, request: HttpRequest) -> Dict[str, Any]:
"""Get tenant metrics for template context."""
if hasattr(request, 'tenant') and request.tenant:
return {
'tenant_name': request.tenant.name,
'tenant_users': request.tenant.users.count(),
'tenant_industry': getattr(request.tenant, 'industry_type', 'general'),
'tenant_domain': getattr(request.tenant, 'domain_url', ''),
}
return {}
class DatabaseMonitoringMiddleware:
"""Middleware for database monitoring."""
def __init__(self, get_response):
self.get_response = get_response
def __call__(self, request: HttpRequest) -> HttpResponse:
# Reset query tracking
initial_queries = len(getattr(connection, 'queries', []))
response = self.get_response(request)
# Calculate query metrics
final_queries = len(getattr(connection, 'queries', []))
queries_executed = final_queries - initial_queries
# Update metrics
if hasattr(request, 'tenant'):
tenant_name = request.tenant.name
DATABASE_QUERIES.labels(
type='total',
table='all',
tenant=tenant_name
).inc(queries_executed)
return response
class CacheMonitoringMiddleware:
"""Middleware for cache monitoring."""
def __init__(self, get_response):
self.get_response = get_response
def __call__(self, request: HttpRequest) -> HttpResponse:
# Initialize cache metrics
request.cache_hits = 0
request.cache_misses = 0
response = self.get_response(request)
# Record cache metrics
if hasattr(request, 'tenant'):
tenant_name = request.tenant.name
CACHE_HITS.labels(
cache_type='django',
tenant=tenant_name
).inc(request.cache_hits)
CACHE_MISSES.labels(
cache_type='django',
tenant=tenant_name
).inc(request.cache_misses)
return response
class SecurityMonitoringMiddleware:
"""Middleware for security monitoring."""
def __init__(self, get_response):
self.get_response = get_response
def __call__(self, request: HttpRequest) -> HttpResponse:
# Security checks before request
self._check_security_headers(request)
self._check_rate_limits(request)
response = self.get_response(request)
# Security monitoring after response
self._monitor_security_events(request, response)
return response
def _check_security_headers(self, request: HttpRequest):
"""Check for security-related headers."""
# Check for suspicious user agents
user_agent = request.META.get('HTTP_USER_AGENT', '')
suspicious_agents = ['sqlmap', 'nikto', 'nmap', 'curl', 'wget']
if any(agent in user_agent.lower() for agent in suspicious_agents):
ERROR_EVENTS.labels(
error_type='suspicious_user_agent',
severity='warning',
tenant='unknown'
).inc()
def _check_rate_limits(self, request: HttpRequest):
"""Check rate limits."""
ip_address = self._get_client_ip(request)
# Implement rate limiting logic here
# This would typically use Redis to track request rates
def _monitor_security_events(self, request: HttpRequest, response: HttpResponse):
"""Monitor security-related events."""
# Resolve tenant name once (request.tenant is a model instance, not a dict)
tenant_name = getattr(getattr(request, 'tenant', None), 'name', 'unknown')
# Monitor for authentication attempts
if '/login' in request.path:
if response.status_code == 200:
AUTH_EVENTS.labels(
event_type='login_attempt',
result='success',
tenant=tenant_name
).inc()
else:
AUTH_EVENTS.labels(
event_type='login_attempt',
result='failed',
tenant=tenant_name
).inc()
# Monitor for SQL injection attempts
if self._detect_sql_injection(request):
ERROR_EVENTS.labels(
error_type='sql_injection_attempt',
severity='critical',
tenant=tenant_name
).inc()
def _get_client_ip(self, request: HttpRequest) -> str:
"""Get client IP address."""
x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR')
if x_forwarded_for:
ip = x_forwarded_for.split(',')[0]
else:
ip = request.META.get('REMOTE_ADDR')
return ip
def _detect_sql_injection(self, request: HttpRequest) -> bool:
"""Detect potential SQL injection attempts."""
sql_injection_patterns = [
"' OR '1'='1",
"DROP TABLE",
"UNION SELECT",
"INSERT INTO",
"UPDATE SET",
"DELETE FROM",
"--",
"/*",
"*/"
]
# Check GET parameters
for value in request.GET.values():
if any(pattern.upper() in value.upper() for pattern in sql_injection_patterns):
return True
# Check POST data
if hasattr(request, 'POST'):
for value in request.POST.values():
if any(pattern.upper() in value.upper() for pattern in sql_injection_patterns):
return True
return False
class MetricsView:
"""View for exposing Prometheus metrics."""
def get_metrics(self) -> str:
"""Get all metrics in Prometheus format."""
# The metrics above are registered on the global REGISTRY; a private
# CollectorRegistry here would always render empty output
return generate_latest(REGISTRY).decode('utf-8')
def get_health_metrics(self) -> Dict[str, Any]:
"""Get health metrics."""
return {
# Info exposes its label dict via the internal _value attribute
'application_info': APP_INFO._value,
'active_users': ACTIVE_USERS._value.get(),
# DATABASE_CONNECTIONS is labelled, so sum across its label sets
'database_connections': sum(
sample.value
for metric in DATABASE_CONNECTIONS.collect()
for sample in metric.samples
),
'timestamp': timezone.now().isoformat(),
}
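A sketch of enabling the middleware stack above in settings.py. The dotted path monitoring.middleware is assumed from this commit's layout, and placing these after Django's defaults is a reasonable starting point rather than a requirement.

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    # ... remaining Django defaults ...
    'monitoring.middleware.MonitoringMiddleware',
    'monitoring.middleware.DatabaseMonitoringMiddleware',
    'monitoring.middleware.CacheMonitoringMiddleware',
    'monitoring.middleware.SecurityMonitoringMiddleware',
]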

backend/monitoring/views.py (new file, 481 lines added)

@@ -0,0 +1,481 @@
"""
Django views for monitoring and metrics endpoints.
"""
import json
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
from django.http import JsonResponse, HttpResponse
from django.views.generic import TemplateView
from django.contrib.auth.mixins import LoginRequiredMixin
from django.db import connection
from django.utils import timezone
from django.conf import settings
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework import status
from prometheus_client import generate_latest, REGISTRY, CONTENT_TYPE_LATEST
from prometheus_client.parser import text_string_to_metric_families
from .exporters import metrics_collector
from .alerts import alert_manager, Alert, AlertSeverity, AlertCategory
class MetricsView(APIView):
"""View for Prometheus metrics endpoint."""
def get(self, request):
"""Get Prometheus metrics."""
try:
# Collect current metrics
metrics_collector.collect_once()
# Generate Prometheus format
metrics_data = generate_latest(REGISTRY)
return HttpResponse(
metrics_data,
content_type=CONTENT_TYPE_LATEST
)
except Exception as e:
return JsonResponse(
{'error': f'Failed to generate metrics: {str(e)}'},
status=status.HTTP_500_INTERNAL_SERVER_ERROR
)
class HealthCheckView(APIView):
"""Health check endpoint."""
def get(self, request):
"""Comprehensive health check."""
try:
health_status = {
'status': 'healthy',
'timestamp': timezone.now().isoformat(),
'version': getattr(settings, 'VERSION', '1.0.0'),
'environment': getattr(settings, 'ENVIRONMENT', 'development'),
'checks': {}
}
# Database health
try:
with connection.cursor() as cursor:
cursor.execute("SELECT 1")
cursor.fetchone()
health_status['checks']['database'] = {
'status': 'healthy',
'response_time': self._measure_response_time(self._check_database)
}
except Exception as e:
health_status['checks']['database'] = {
'status': 'unhealthy',
'error': str(e)
}
health_status['status'] = 'degraded'
# Cache health
try:
from django.core.cache import cache
cache.set('health_check', 'test', 1)
cache.get('health_check')
health_status['checks']['cache'] = {
'status': 'healthy',
'response_time': self._measure_response_time(self._check_cache)
}
except Exception as e:
health_status['checks']['cache'] = {
'status': 'unhealthy',
'error': str(e)
}
health_status['status'] = 'degraded'
# Storage health
try:
storage_health = self._check_storage()
health_status['checks']['storage'] = storage_health
if storage_health['status'] != 'healthy':
health_status['status'] = 'degraded'
except Exception as e:
health_status['checks']['storage'] = {
'status': 'unhealthy',
'error': str(e)
}
health_status['status'] = 'degraded'
# External services health
external_services = self._check_external_services()
health_status['checks']['external_services'] = external_services
if any(service['status'] != 'healthy' for service in external_services.values()):
health_status['status'] = 'degraded'
# Malaysian services health
malaysian_services = self._check_malaysian_services()
health_status['checks']['malaysian_services'] = malaysian_services
if any(service['status'] != 'healthy' for service in malaysian_services.values()):
health_status['status'] = 'degraded'
# Determine HTTP status code
if health_status['status'] == 'healthy':
http_status = status.HTTP_200_OK
elif health_status['status'] == 'degraded':
http_status = status.HTTP_503_SERVICE_UNAVAILABLE
else:
http_status = status.HTTP_500_INTERNAL_SERVER_ERROR
return Response(health_status, status=http_status)
except Exception as e:
return Response(
{'status': 'unhealthy', 'error': str(e)},
status=status.HTTP_500_INTERNAL_SERVER_ERROR
)
def _measure_response_time(self, func) -> float:
"""Measure response time of a function."""
import time
start_time = time.time()
func()
return time.time() - start_time
def _check_database(self):
"""Check database connectivity."""
with connection.cursor() as cursor:
cursor.execute("SELECT 1")
cursor.fetchone()
def _check_cache(self):
"""Check cache functionality."""
from django.core.cache import cache
cache.set('health_check', 'test', 1)
result = cache.get('health_check')
if result != 'test':
raise Exception("Cache functionality failed")
def _check_storage(self) -> Dict[str, Any]:
"""Check storage availability."""
try:
import os
# Check media directory
media_path = getattr(settings, 'MEDIA_ROOT', '/media')
if not os.path.exists(media_path):
return {'status': 'unhealthy', 'error': 'Media directory not found'}
# Check write permissions
test_file = os.path.join(media_path, 'health_check_test.tmp')
try:
with open(test_file, 'w') as f:
f.write('test')
os.remove(test_file)
except Exception as e:
return {'status': 'unhealthy', 'error': f'Write permission error: {str(e)}'}
# Check disk space
disk_usage = os.statvfs(media_path)
free_space_percent = (disk_usage.f_bavail * disk_usage.f_frsize) / (disk_usage.f_blocks * disk_usage.f_frsize) * 100
if free_space_percent < 10:
return {
'status': 'degraded',
'error': f'Low disk space: {free_space_percent:.1f}% free'
}
return {'status': 'healthy', 'free_space_percent': free_space_percent}
except Exception as e:
return {'status': 'unhealthy', 'error': str(e)}
def _check_external_services(self) -> Dict[str, Dict[str, Any]]:
"""Check external services health."""
services = {}
        # Check email service (a distinct name avoids shadowing the
        # module-level django.db connection import)
        try:
            from django.core.mail import get_connection
            email_connection = get_connection()
            email_connection.open()
            email_connection.close()
            services['email'] = {'status': 'healthy'}
        except Exception as e:
            services['email'] = {'status': 'unhealthy', 'error': str(e)}
        # Check Redis only if configured -- a missing REDIS_URL setting should
        # not be reported as an unhealthy service
        redis_url = getattr(settings, 'REDIS_URL', None)
        if redis_url:
            try:
                import redis
                redis_client = redis.from_url(redis_url)
                redis_client.ping()
                services['redis'] = {'status': 'healthy'}
            except Exception as e:
                services['redis'] = {'status': 'unhealthy', 'error': str(e)}
# Check external APIs (if configured)
external_apis = getattr(settings, 'EXTERNAL_APIS', {})
for api_name, api_config in external_apis.items():
try:
import requests
response = requests.get(
api_config['health_url'],
timeout=api_config.get('timeout', 5)
)
if response.status_code == 200:
services[api_name] = {'status': 'healthy'}
else:
services[api_name] = {
'status': 'unhealthy',
'error': f'HTTP {response.status_code}'
}
except Exception as e:
services[api_name] = {'status': 'unhealthy', 'error': str(e)}
return services
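
    # A minimal sketch of the EXTERNAL_APIS setting the loop above expects;
    # the service name and URL are illustrative placeholders:
    #
    #   EXTERNAL_APIS = {
    #       'payment_gateway': {                                  # hypothetical service
    #           'health_url': 'https://gateway.example.com/health',
    #           'timeout': 5,  # seconds; defaults to 5 if omitted
    #       },
    #   }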
def _check_malaysian_services(self) -> Dict[str, Dict[str, Any]]:
"""Check Malaysian-specific services."""
services = {}
# Check Malaysian postcode service
try:
from core.services.malaysian_services import MalaysianPostcodeService
postcode_service = MalaysianPostcodeService()
result = postcode_service.lookup_postcode('50000')
services['postcode_service'] = {
'status': 'healthy' if result else 'degraded'
}
except Exception as e:
services['postcode_service'] = {'status': 'unhealthy', 'error': str(e)}
# Check SST calculation service
try:
from core.services.malaysian_services import SSTCalculationService
sst_service = SSTCalculationService()
result = sst_service.calculate_sst(100, 'standard', 'Johor')
services['sst_service'] = {
'status': 'healthy' if result is not None else 'degraded'
}
except Exception as e:
services['sst_service'] = {'status': 'unhealthy', 'error': str(e)}
# Check IC validation service
try:
from core.services.malaysian_services import ICValidationService
ic_service = ICValidationService()
            # Use a well-formed 12-digit IC (YYMMDD-PB-####) so the probe
            # exercises the normal validation path; the value is a test fixture
            result = ic_service.validate_ic('901231015678')
services['ic_validation_service'] = {
'status': 'healthy' if result is not None else 'degraded'
}
except Exception as e:
services['ic_validation_service'] = {'status': 'unhealthy', 'error': str(e)}
return services
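
# Example client-side probe of the health endpoint above -- a hedged sketch,
# since the URL path depends on how this view is wired into urls.py:
#
#   import requests
#   resp = requests.get('https://app.example.com/api/monitoring/health/', timeout=10)
#   body = resp.json()
#   assert body['status'] in ('healthy', 'degraded', 'unhealthy')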
class AlertsView(APIView):
"""Alerts management endpoint."""
def get(self, request):
"""Get alerts."""
try:
# Check for new alerts
alert_manager.check_rules()
# Get query parameters
            severity = request.query_params.get('severity')
            category = request.query_params.get('category')
            # A distinct name is required here: binding 'status' would shadow
            # the rest_framework.status module used in the except block below
            alert_status = request.query_params.get('status', 'active')
            hours = int(request.query_params.get('hours', 24))
            # Get alerts
            if alert_status == 'active':
                alerts = alert_manager.get_active_alerts(severity=severity, category=category)
            else:
                alerts = alert_manager.get_alert_history(hours=hours)
if severity:
alerts = [a for a in alerts if a.severity == severity]
if category:
alerts = [a for a in alerts if a.category == category]
# Convert to response format
response_data = {
'alerts': [alert.to_dict() for alert in alerts],
'summary': self._get_alerts_summary(alerts),
'timestamp': timezone.now().isoformat()
}
return Response(response_data)
except Exception as e:
return Response(
{'error': f'Failed to get alerts: {str(e)}'},
status=status.HTTP_500_INTERNAL_SERVER_ERROR
)
def post(self, request):
"""Create manual alert."""
try:
data = request.data
alert = Alert(
title=data['title'],
description=data['description'],
severity=data.get('severity', AlertSeverity.INFO),
category=data.get('category', AlertCategory.SYSTEM),
metadata=data.get('metadata', {})
)
alert_manager.trigger_alert(alert)
return Response(
{'message': 'Alert created successfully', 'alert_id': alert.id},
status=status.HTTP_201_CREATED
)
except Exception as e:
return Response(
{'error': f'Failed to create alert: {str(e)}'},
status=status.HTTP_500_INTERNAL_SERVER_ERROR
)
def _get_alerts_summary(self, alerts) -> Dict[str, Any]:
"""Get alerts summary statistics."""
summary = {
'total': len(alerts),
'by_severity': {},
'by_category': {},
'by_status': {}
}
for alert in alerts:
# Count by severity
summary['by_severity'][alert.severity] = summary['by_severity'].get(alert.severity, 0) + 1
# Count by category
summary['by_category'][alert.category] = summary['by_category'].get(alert.category, 0) + 1
            # Count by status (a distinct name avoids rebinding 'status')
            alert_status = alert.get_status()
            summary['by_status'][alert_status] = summary['by_status'].get(alert_status, 0) + 1
return summary
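
# Example payload for AlertsView.post above; 'title' and 'description' are
# required, the other fields fall back to defaults. The URL path is an
# assumption about the URLconf:
#
#   import requests
#   payload = {
#       'title': 'Scheduled SST rate update',
#       'description': 'SST configuration maintenance window tonight',
#       'severity': 'warning',      # defaults to 'info'
#       'category': 'malaysian',    # defaults to 'system'
#       'metadata': {'tenant': 'all'},
#   }
#   requests.post('https://app.example.com/api/monitoring/alerts/', json=payload)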
class AlertActionView(APIView):
"""Alert management actions."""
def post(self, request, alert_id: str, action: str):
"""Perform alert actions."""
        try:
            # Resolve the acting user once for both actions
            user = request.user.username if request.user.is_authenticated else 'api_user'
            if action == 'acknowledge':
                alert_manager.acknowledge_alert(alert_id, user)
                return Response({'message': f'Alert {alert_id} acknowledged'})
            elif action == 'resolve':
                alert_manager.resolve_alert(alert_id, user)
                return Response({'message': f'Alert {alert_id} resolved'})
            else:
                return Response(
                    {'error': f'Unknown action: {action}'},
                    status=status.HTTP_400_BAD_REQUEST
                )
except Exception as e:
return Response(
{'error': f'Failed to perform action {action} on alert {alert_id}: {str(e)}'},
status=status.HTTP_500_INTERNAL_SERVER_ERROR
)
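
# Example calls against AlertActionView -- hedged, assuming a URL pattern like
# 'alerts/<alert_id>/<action>/' and an illustrative alert id:
#
#   requests.post('https://app.example.com/api/monitoring/alerts/1700000000-42/acknowledge/')
#   requests.post('https://app.example.com/api/monitoring/alerts/1700000000-42/resolve/')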
class MonitoringDashboardView(LoginRequiredMixin, TemplateView):
"""Monitoring dashboard template view."""
template_name = 'monitoring/dashboard.html'
def get_context_data(self, **kwargs):
"""Get dashboard context data."""
context = super().get_context_data(**kwargs)
# Get current alerts
context['active_alerts'] = alert_manager.get_active_alerts()
context['alert_summary'] = self._get_alerts_summary(context['active_alerts'])
# Get system metrics
context['system_metrics'] = self._get_system_metrics()
# Get business metrics
context['business_metrics'] = self._get_business_metrics()
# Malaysian-specific metrics
context['malaysian_metrics'] = self._get_malaysian_metrics()
return context
def _get_alerts_summary(self, alerts) -> Dict[str, Any]:
"""Get alerts summary."""
summary = {
'total': len(alerts),
'critical': len([a for a in alerts if a.severity == AlertSeverity.CRITICAL]),
'error': len([a for a in alerts if a.severity == AlertSeverity.ERROR]),
'warning': len([a for a in alerts if a.severity == AlertSeverity.WARNING]),
'info': len([a for a in alerts if a.severity == AlertSeverity.INFO]),
}
return summary
def _get_system_metrics(self) -> Dict[str, Any]:
"""Get system metrics."""
try:
import psutil
return {
                # interval=1 blocks the request for ~1 second; acceptable for an
                # internal dashboard, but avoid on hot paths
                'cpu_usage': psutil.cpu_percent(interval=1),
'memory_usage': psutil.virtual_memory().percent,
'disk_usage': psutil.disk_usage('/').percent,
'load_average': psutil.getloadavg()[0],
'uptime': datetime.now() - datetime.fromtimestamp(psutil.boot_time()),
}
except Exception:
return {}
def _get_business_metrics(self) -> Dict[str, Any]:
"""Get business metrics."""
try:
from django.contrib.auth import get_user_model
from core.models import Transaction
User = get_user_model()
# Active users
active_users = User.objects.filter(
is_active=True,
last_login__gte=timezone.now() - timedelta(minutes=30)
).count()
# Today's transactions
today_transactions = Transaction.objects.filter(
created_at__date=timezone.now().date(),
status='completed'
).count()
return {
'active_users': active_users,
'today_transactions': today_transactions,
}
except Exception:
return {}
def _get_malaysian_metrics(self) -> Dict[str, Any]:
"""Get Malaysian-specific metrics."""
try:
from core.models import MalaysianICValidation, SSTCalculation
return {
'ic_validations_today': MalaysianICValidation.objects.filter(
created_at__date=timezone.now().date()
).count(),
'sst_calculations_today': SSTCalculation.objects.filter(
created_at__date=timezone.now().date()
).count(),
}
except Exception:
return {}
class MetricsDashboardView(LoginRequiredMixin, TemplateView):
"""Metrics dashboard template view."""
template_name = 'monitoring/metrics_dashboard.html'