project initialization

backend/monitoring/alerts.py (new file, 584 lines)
@@ -0,0 +1,584 @@
"""
Alert management system for the Malaysian SME SaaS platform.
Provides comprehensive alerting with Malaysian context.
"""

import json
import logging
import smtplib
import requests
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Callable
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from django.conf import settings
from django.core.mail import send_mail
from django.utils import timezone
from django.db import connection
from prometheus_client import Counter, Gauge
import redis

logger = logging.getLogger(__name__)

# Alert metrics
ALERTS_TOTAL = Counter('alerts_total', 'Total alerts generated', ['severity', 'category'])
ALERTS_RESOLVED = Counter('alerts_resolved_total', 'Total alerts resolved', ['severity', 'category'])
ALERTS_ACTIVE = Gauge('alerts_active', 'Currently active alerts', ['severity', 'category'])

class AlertSeverity:
    """Alert severity levels."""
    INFO = 'info'
    WARNING = 'warning'
    ERROR = 'error'
    CRITICAL = 'critical'


class AlertCategory:
    """Alert categories."""
    SYSTEM = 'system'
    APPLICATION = 'application'
    DATABASE = 'database'
    CACHE = 'cache'
    SECURITY = 'security'
    BUSINESS = 'business'
    MALAYSIAN = 'malaysian'


class Alert:
    """Single alert instance."""

    def __init__(
        self,
        title: str,
        description: str,
        severity: str,
        category: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        self.id = f"{int(timezone.now().timestamp())}-{hash(title)}"
        self.title = title
        self.description = description
        self.severity = severity
        self.category = category
        self.metadata = metadata or {}
        self.created_at = timezone.now()
        self.resolved_at = None
        self.acknowledged_at = None
        self.acknowledged_by = None
        self.tags = self.metadata.get('tags', [])
        self.source = self.metadata.get('source', 'system')
        self.tenant = self.metadata.get('tenant', 'all')

    def to_dict(self) -> Dict[str, Any]:
        """Convert alert to dictionary."""
        return {
            'id': self.id,
            'title': self.title,
            'description': self.description,
            'severity': self.severity,
            'category': self.category,
            'metadata': self.metadata,
            'created_at': self.created_at.isoformat(),
            'resolved_at': self.resolved_at.isoformat() if self.resolved_at else None,
            'acknowledged_at': self.acknowledged_at.isoformat() if self.acknowledged_at else None,
            'acknowledged_by': self.acknowledged_by,
            'tags': self.tags,
            'source': self.source,
            'tenant': self.tenant,
            'status': self.get_status(),
        }

    def get_status(self) -> str:
        """Get alert status."""
        if self.resolved_at:
            return 'resolved'
        elif self.acknowledged_at:
            return 'acknowledged'
        else:
            return 'active'

    def acknowledge(self, user: str):
        """Acknowledge alert."""
        self.acknowledged_at = timezone.now()
        self.acknowledged_by = user
        logger.info(f"Alert {self.id} acknowledged by {user}")

    def resolve(self, user: Optional[str] = None):
        """Resolve alert."""
        self.resolved_at = timezone.now()
        if user:
            self.acknowledged_by = user
        logger.info(f"Alert {self.id} resolved by {user or 'system'}")

        # Update metrics
        ALERTS_RESOLVED.labels(
            severity=self.severity,
            category=self.category
        ).inc()

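# Usage sketch (illustrative, not part of the committed file): a single alert
# moves 'active' -> 'acknowledged' -> 'resolved'.
#
#   alert = Alert('Disk almost full', 'Volume / is at 92% capacity',
#                 AlertSeverity.WARNING, AlertCategory.SYSTEM)
#   alert.get_status()        # 'active'
#   alert.acknowledge('ops')  # 'acknowledged'
#   alert.resolve('ops')      # 'resolved'; also increments ALERTS_RESOLVED
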
class AlertRule:
    """Alert rule definition."""

    def __init__(
        self,
        name: str,
        condition: Callable[[], bool],
        title_template: str,
        description_template: str,
        severity: str,
        category: str,
        cooldown_minutes: int = 15,
        enabled: bool = True
    ):
        self.name = name
        self.condition = condition
        self.title_template = title_template
        self.description_template = description_template
        self.severity = severity
        self.category = category
        self.cooldown_minutes = cooldown_minutes
        self.enabled = enabled
        self.last_triggered = None
        self.metadata = {}

    def should_trigger(self) -> bool:
        """Check if rule should trigger alert."""
        if not self.enabled:
            return False

        # Check cooldown
        if self.last_triggered:
            cooldown_until = self.last_triggered + timedelta(minutes=self.cooldown_minutes)
            if timezone.now() < cooldown_until:
                return False

        # Check condition
        try:
            return self.condition()
        except Exception as e:
            logger.error(f"Error checking alert rule {self.name}: {e}")
            return False

    def trigger(self, metadata: Optional[Dict[str, Any]] = None) -> Alert:
        """Trigger alert from rule."""
        self.last_triggered = timezone.now()
        self.metadata = metadata or {}

        # Format title and description; fall back to the raw templates when a
        # placeholder (e.g. {cpu_usage}) is not supplied in the metadata, so a
        # rule triggered without metadata still produces an alert.
        try:
            title = self.title_template.format(**self.metadata)
            description = self.description_template.format(**self.metadata)
        except (KeyError, IndexError):
            title = self.title_template
            description = self.description_template

        return Alert(
            title=title,
            description=description,
            severity=self.severity,
            category=self.category,
            metadata={**self.metadata, 'rule_name': self.name}
        )

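# Usage sketch (illustrative): with cooldown_minutes=15 a rule fires at most
# once per window, even while its condition stays true.
#
#   rule = AlertRule('demo', lambda: True, 'Demo alert', 'Demo body',
#                    AlertSeverity.INFO, AlertCategory.SYSTEM, cooldown_minutes=15)
#   rule.should_trigger()   # True
#   alert = rule.trigger()  # records last_triggered
#   rule.should_trigger()   # False until 15 minutes have elapsed
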
class AlertManager:
    """Main alert management system."""

    def __init__(self):
        self.rules: List[AlertRule] = []
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: List[Alert] = []
        self.notifiers = []
        self.redis_client = None
        self.initialize_redis()
        self.setup_default_rules()
        self.setup_notifiers()

    def initialize_redis(self):
        """Initialize Redis connection for alert persistence."""
        try:
            self.redis_client = redis.from_url(settings.REDIS_URL)
        except Exception as e:
            logger.warning(f"Failed to initialize Redis for alerts: {e}")

    def setup_default_rules(self):
        """Setup default alert rules."""
        # System alerts
        self.add_rule(AlertRule(
            name='high_cpu_usage',
            condition=self._check_high_cpu_usage,
            title_template='High CPU Usage Detected',
            description_template='CPU usage is {cpu_usage}% on server {server}',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.SYSTEM,
            cooldown_minutes=10
        ))

        self.add_rule(AlertRule(
            name='critical_cpu_usage',
            condition=self._check_critical_cpu_usage,
            title_template='Critical CPU Usage',
            description_template='CPU usage is {cpu_usage}% on server {server} - immediate attention required',
            severity=AlertSeverity.CRITICAL,
            category=AlertCategory.SYSTEM,
            cooldown_minutes=5
        ))

        # Database alerts
        self.add_rule(AlertRule(
            name='database_connection_errors',
            condition=self._check_database_connection_errors,
            title_template='Database Connection Errors',
            description_template='Database connection errors detected: {error_count} errors in the last 5 minutes',
            severity=AlertSeverity.ERROR,
            category=AlertCategory.DATABASE,
            cooldown_minutes=5
        ))

        # Application alerts
        self.add_rule(AlertRule(
            name='high_error_rate',
            condition=self._check_high_error_rate,
            title_template='High Application Error Rate',
            description_template='Application error rate is {error_rate}% (threshold: 5%)',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.APPLICATION,
            cooldown_minutes=15
        ))

        # Business alerts
        self.add_rule(AlertRule(
            name='low_active_users',
            condition=self._check_low_active_users,
            title_template='Low Active Users',
            description_template='Only {active_users} active users detected (threshold: {threshold})',
            severity=AlertSeverity.INFO,
            category=AlertCategory.BUSINESS,
            cooldown_minutes=60
        ))

        # Malaysian-specific alerts
        self.add_rule(AlertRule(
            name='malaysian_service_degradation',
            condition=self._check_malaysian_service_degradation,
            title_template='Malaysian Service Degradation',
            description_template='Malaysian service availability is {availability}% (threshold: 99%)',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.MALAYSIAN,
            cooldown_minutes=10
        ))

        # Security alerts
        self.add_rule(AlertRule(
            name='suspicious_login_activity',
            condition=self._check_suspicious_login_activity,
            title_template='Suspicious Login Activity',
            description_template='Detected {failed_logins} failed login attempts from IP {ip_address}',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.SECURITY,
            cooldown_minutes=15
        ))

    def setup_notifiers(self):
        """Setup notification channels."""
        # Email notifier
        if getattr(settings, 'EMAIL_HOST', None):
            self.add_notifier(EmailNotifier())

        # Slack notifier
        if hasattr(settings, 'SLACK_WEBHOOK_URL'):
            self.add_notifier(SlackNotifier())

        # SMS notifier for critical alerts (Malaysian numbers)
        if hasattr(settings, 'SMS_API_KEY'):
            self.add_notifier(SMSNotifier())

    def add_rule(self, rule: AlertRule):
        """Add alert rule."""
        self.rules.append(rule)
        logger.info(f"Added alert rule: {rule.name}")

    def add_notifier(self, notifier):
        """Add notification channel."""
        self.notifiers.append(notifier)
        logger.info(f"Added notifier: {notifier.__class__.__name__}")

    def check_rules(self):
        """Check all alert rules and trigger if needed."""
        for rule in self.rules:
            try:
                if rule.should_trigger():
                    alert = rule.trigger()
                    self.trigger_alert(alert)
            except Exception as e:
                logger.error(f"Error checking rule {rule.name}: {e}")

    def trigger_alert(self, alert: Alert):
        """Trigger new alert."""
        # Check if similar active alert exists
        for existing_alert in self.active_alerts.values():
            if (existing_alert.title == alert.title and
                    existing_alert.severity == alert.severity and
                    existing_alert.get_status() == 'active'):
                logger.debug(f"Similar alert already active: {existing_alert.id}")
                return

        # Add alert
        self.active_alerts[alert.id] = alert
        self.alert_history.append(alert)

        # Update metrics (ALERTS_ACTIVE is decremented again in resolve_alert)
        ALERTS_TOTAL.labels(
            severity=alert.severity,
            category=alert.category
        ).inc()
        ALERTS_ACTIVE.labels(
            severity=alert.severity,
            category=alert.category
        ).inc()

        # Keep only recent history
        if len(self.alert_history) > 1000:
            self.alert_history = self.alert_history[-1000:]

        # Store in Redis
        if self.redis_client:
            try:
                self.redis_client.setex(
                    f"alert:{alert.id}",
                    86400,  # 24 hours
                    json.dumps(alert.to_dict())
                )
            except Exception as e:
                logger.error(f"Failed to store alert in Redis: {e}")

        # Send notifications
        self.send_notifications(alert)

        logger.warning(f"Alert triggered: {alert.title} ({alert.severity})")

    def resolve_alert(self, alert_id: str, user: Optional[str] = None):
        """Resolve alert."""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.resolve(user)
            del self.active_alerts[alert_id]
            ALERTS_ACTIVE.labels(
                severity=alert.severity,
                category=alert.category
            ).dec()

            # Update Redis
            if self.redis_client:
                try:
                    self.redis_client.delete(f"alert:{alert_id}")
                except Exception as e:
                    logger.error(f"Failed to delete alert from Redis: {e}")

            logger.info(f"Alert resolved: {alert.title}")

    def acknowledge_alert(self, alert_id: str, user: str):
        """Acknowledge alert."""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.acknowledge(user)
            logger.info(f"Alert acknowledged: {alert.title} by {user}")

    def get_active_alerts(self, severity: Optional[str] = None, category: Optional[str] = None) -> List[Alert]:
        """Get active alerts with optional filtering."""
        alerts = list(self.active_alerts.values())

        if severity:
            alerts = [a for a in alerts if a.severity == severity]

        if category:
            alerts = [a for a in alerts if a.category == category]

        return alerts

    def get_alert_history(self, hours: int = 24) -> List[Alert]:
        """Get alert history for specified hours."""
        since = timezone.now() - timedelta(hours=hours)
        return [a for a in self.alert_history if a.created_at >= since]

    def send_notifications(self, alert: Alert):
        """Send alert notifications."""
        for notifier in self.notifiers:
            try:
                if notifier.should_notify(alert):
                    notifier.send(alert)
            except Exception as e:
                logger.error(f"Error sending notification via {notifier.__class__.__name__}: {e}")

    # Alert condition methods
    def _check_high_cpu_usage(self) -> bool:
        """Check for high CPU usage."""
        try:
            import psutil
            cpu_usage = psutil.cpu_percent(interval=1)
            return cpu_usage > 80
        except Exception:
            return False

    def _check_critical_cpu_usage(self) -> bool:
        """Check for critical CPU usage."""
        try:
            import psutil
            cpu_usage = psutil.cpu_percent(interval=1)
            return cpu_usage > 90
        except Exception:
            return False

    def _check_database_connection_errors(self) -> bool:
        """Check for database connection errors."""
        try:
            # This would integrate with your error tracking system
            # For now, return False as a placeholder
            return False
        except Exception:
            return False

    def _check_high_error_rate(self) -> bool:
        """Check for high application error rate."""
        try:
            # This would check application error rates
            # For now, return False as a placeholder
            return False
        except Exception:
            return False

    def _check_low_active_users(self) -> bool:
        """Check for low active users."""
        try:
            from django.contrib.auth import get_user_model
            User = get_user_model()
            five_minutes_ago = timezone.now() - timedelta(minutes=5)
            active_count = User.objects.filter(
                last_login__gte=five_minutes_ago,
                is_active=True
            ).count()
            return active_count < 10
        except Exception:
            return False

    def _check_malaysian_service_degradation(self) -> bool:
        """Check for Malaysian service degradation."""
        try:
            # This would check Malaysian-specific service health
            # For now, return False as a placeholder
            return False
        except Exception:
            return False

    def _check_suspicious_login_activity(self) -> bool:
        """Check for suspicious login activity."""
        try:
            # This would check for suspicious login patterns
            # For now, return False as a placeholder
            return False
        except Exception:
            return False


class EmailNotifier:
    """Email notification system."""

    def should_notify(self, alert: Alert) -> bool:
        """Check if should send email notification."""
        # Send emails for warnings and above
        return alert.severity in [AlertSeverity.WARNING, AlertSeverity.ERROR, AlertSeverity.CRITICAL]

    def send(self, alert: Alert):
        """Send email notification."""
        try:
            subject = f"[{alert.severity.upper()}] {alert.title}"
            message = f"""
Alert Details:
- Title: {alert.title}
- Severity: {alert.severity}
- Category: {alert.category}
- Description: {alert.description}
- Time: {alert.created_at}
- Source: {alert.source}
- Tenant: {alert.tenant}

Additional Information:
{json.dumps(alert.metadata, indent=2)}
"""

            send_mail(
                subject,
                message,
                settings.DEFAULT_FROM_EMAIL,
                settings.ALERT_EMAIL_RECIPIENTS,
                fail_silently=False
            )

            logger.info(f"Email notification sent for alert: {alert.id}")
        except Exception as e:
            logger.error(f"Failed to send email notification: {e}")


class SlackNotifier:
    """Slack notification system."""

    def should_notify(self, alert: Alert) -> bool:
        """Check if should send Slack notification."""
        # Send Slack for all alerts
        return True

    def send(self, alert: Alert):
        """Send Slack notification."""
        try:
            webhook_url = settings.SLACK_WEBHOOK_URL

            # Color based on severity
            colors = {
                AlertSeverity.INFO: '#36a64f',
                AlertSeverity.WARNING: '#ff9500',
                AlertSeverity.ERROR: '#ff0000',
                AlertSeverity.CRITICAL: '#990000'
            }

            payload = {
                'text': f'{alert.severity.upper()}: {alert.title}',
                'attachments': [{
                    'color': colors.get(alert.severity, '#36a64f'),
                    'title': alert.title,
                    'text': alert.description,
                    'fields': [
                        {'title': 'Severity', 'value': alert.severity, 'short': True},
                        {'title': 'Category', 'value': alert.category, 'short': True},
                        {'title': 'Time', 'value': alert.created_at.strftime('%Y-%m-%d %H:%M:%S'), 'short': True},
                        {'title': 'Tenant', 'value': alert.tenant, 'short': True},
                    ],
                    'footer': 'Malaysian SME Platform Alert System',
                    'ts': int(alert.created_at.timestamp())
                }]
            }

            response = requests.post(webhook_url, json=payload, timeout=10)
            response.raise_for_status()

            logger.info(f"Slack notification sent for alert: {alert.id}")
        except Exception as e:
            logger.error(f"Failed to send Slack notification: {e}")


class SMSNotifier:
    """SMS notification system for critical alerts."""

    def should_notify(self, alert: Alert) -> bool:
        """Check if should send SMS notification."""
        # Only send SMS for critical alerts
        return alert.severity == AlertSeverity.CRITICAL

    def send(self, alert: Alert):
        """Send SMS notification."""
        try:
            # This would integrate with a Malaysian SMS service
            # For now, just log the attempt
            logger.info(f"SMS notification would be sent for critical alert: {alert.id}")

            # Example integration with a Malaysian SMS service:
            # sms_api_url = settings.SMS_API_URL
            # api_key = settings.SMS_API_KEY
            # recipients = settings.CRITICAL_ALERT_SMS_RECIPIENTS
            # message = f"CRITICAL: {alert.title}. {alert.description[:100]}"
            # payload = {
            #     'api_key': api_key,
            #     'recipients': recipients,
            #     'message': message
            # }
            # response = requests.post(sms_api_url, json=payload, timeout=10)
            # response.raise_for_status()

        except Exception as e:
            logger.error(f"Failed to send SMS notification: {e}")


# Global alert manager instance
alert_manager = AlertManager()
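
A minimal wiring sketch for the manager above (illustrative; the import path assumes the app is installed as `monitoring`, and the rule shown here is hypothetical):

    from monitoring.alerts import alert_manager, AlertRule, AlertSeverity, AlertCategory

    alert_manager.add_rule(AlertRule(
        name='redis_unreachable',
        condition=lambda: alert_manager.redis_client is None,
        title_template='Redis unavailable',
        description_template='Alert persistence is degraded: Redis is unreachable',
        severity=AlertSeverity.ERROR,
        category=AlertCategory.CACHE,
        cooldown_minutes=30,
    ))

    alert_manager.check_rules()  # evaluate every enabled rule once
    for alert in alert_manager.get_active_alerts(severity=AlertSeverity.ERROR):
        alert_manager.acknowledge_alert(alert.id, user='oncall')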

backend/monitoring/exporters.py (new file, 709 lines)
@@ -0,0 +1,709 @@
"""
Prometheus exporters for various system and application metrics.
"""

import time
import logging
import threading
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from django.db import connection, connections
from django.core.cache import cache
from django.conf import settings
from django.contrib.auth import get_user_model
from django.db.models import Count, Q, Avg, Sum
from django.utils import timezone
from django_tenants.utils import get_tenant_model
from prometheus_client import Gauge, Counter, Histogram, Info, start_http_server
from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, REGISTRY
import psutil
import redis

from .middleware import (
    DATABASE_QUERIES, CACHE_HITS, CACHE_MISSES, MALAYSIAN_OPERATIONS,
    TENANT_METRICS, BUSINESS_METRICS, ERROR_EVENTS
)

logger = logging.getLogger(__name__)
User = get_user_model()
TenantModel = get_tenant_model()


class DatabaseExporter:
    """Exporter for database metrics."""

    def __init__(self):
        self.metrics = {
            'database_size': Gauge(
                'database_size_bytes',
                'Database size in bytes',
                ['database', 'tenant']
            ),
            'database_connections': Gauge(
                'database_connections_current',
                'Current database connections',
                ['state', 'tenant']
            ),
            'database_transactions': Counter(
                'database_transactions_total',
                'Database transactions',
                ['type', 'tenant']
            ),
            'database_query_time': Histogram(
                'database_query_duration_seconds',
                'Database query duration',
                ['query_type', 'tenant']
            ),
            'database_deadlocks': Counter(
                'database_deadlocks_total',
                'Database deadlocks',
                ['tenant']
            ),
            'database_cache_hit_ratio': Gauge(
                'database_cache_hit_ratio',
                'Database cache hit ratio',
                ['tenant']
            ),
        }

    def collect_metrics(self):
        """Collect database metrics."""
        try:
            self._collect_database_size()
            self._collect_connection_metrics()
            self._collect_transaction_metrics()
            self._collect_performance_metrics()
            self._collect_deadlock_metrics()
        except Exception as e:
            logger.error(f"Error collecting database metrics: {e}")

    def _collect_database_size(self):
        """Collect database size metrics."""
        try:
            with connection.cursor() as cursor:
                cursor.execute("""
                    SELECT datname, pg_database_size(datname) as size
                    FROM pg_database
                    WHERE datistemplate = false
                """)
                for row in cursor.fetchall():
                    db_name, size = row
                    self.metrics['database_size'].labels(
                        database=db_name,
                        tenant='all'
                    ).set(size)
        except Exception as e:
            logger.error(f"Error collecting database size: {e}")

    def _collect_connection_metrics(self):
        """Collect connection metrics."""
        try:
            with connection.cursor() as cursor:
                # Current connections
                cursor.execute("""
                    SELECT state, COUNT(*)
                    FROM pg_stat_activity
                    WHERE pid <> pg_backend_pid()
                    GROUP BY state
                """)
                for state, count in cursor.fetchall():
                    self.metrics['database_connections'].labels(
                        state=state or 'idle',
                        tenant='all'
                    ).set(count)

                # Max connections (SHOW returns a string)
                cursor.execute("SHOW max_connections")
                max_connections = int(cursor.fetchone()[0])
                self.metrics['database_connections'].labels(
                    state='max',
                    tenant='all'
                ).set(max_connections)
        except Exception as e:
            logger.error(f"Error collecting connection metrics: {e}")

    def _collect_transaction_metrics(self):
        """Collect transaction metrics."""
        try:
            with connection.cursor() as cursor:
                cursor.execute("""
                    SELECT datname, xact_commit, xact_rollback
                    FROM pg_stat_database
                """)
                for db_name, commits, rollbacks in cursor.fetchall():
                    # pg_stat_database reports cumulative totals, so the
                    # counters are set directly; ._value.set() is a private
                    # prometheus_client API, used because Counter exposes no
                    # public set().
                    self.metrics['database_transactions'].labels(
                        type='commit',
                        tenant=db_name
                    )._value.set(commits)
                    self.metrics['database_transactions'].labels(
                        type='rollback',
                        tenant=db_name
                    )._value.set(rollbacks)
        except Exception as e:
            logger.error(f"Error collecting transaction metrics: {e}")

    def _collect_performance_metrics(self):
        """Collect performance metrics."""
        try:
            with connection.cursor() as cursor:
                # Query performance (requires the pg_stat_statements extension;
                # on PostgreSQL 13+ the columns are total_exec_time/mean_exec_time)
                cursor.execute("""
                    SELECT query, calls, total_time, mean_time, rows
                    FROM pg_stat_statements
                    ORDER BY total_time DESC
                    LIMIT 100
                """)
                for query, calls, total_time, mean_time, rows in cursor.fetchall():
                    query_type = self._classify_query(query)
                    self.metrics['database_query_time'].labels(
                        query_type=query_type,
                        tenant='all'
                    ).observe(mean_time / 1000)  # pg_stat_statements reports milliseconds

                # Cache hit ratio (cast to float: bigint division would
                # truncate the ratio to zero)
                cursor.execute("""
                    SELECT sum(blks_hit)::float / NULLIF(sum(blks_hit) + sum(blks_read), 0) as hit_ratio
                    FROM pg_stat_database
                """)
                hit_ratio = cursor.fetchone()[0]
                if hit_ratio:
                    self.metrics['database_cache_hit_ratio'].labels(
                        tenant='all'
                    ).set(hit_ratio * 100)
        except Exception as e:
            logger.error(f"Error collecting performance metrics: {e}")

    def _collect_deadlock_metrics(self):
        """Collect deadlock metrics."""
        try:
            with connection.cursor() as cursor:
                cursor.execute("""
                    SELECT datname, deadlocks
                    FROM pg_stat_database
                """)
                for db_name, deadlocks in cursor.fetchall():
                    if deadlocks > 0:
                        self.metrics['database_deadlocks'].labels(
                            tenant=db_name
                        )._value.set(deadlocks)
        except Exception as e:
            logger.error(f"Error collecting deadlock metrics: {e}")

    def _classify_query(self, query: str) -> str:
        """Classify SQL query type."""
        query_upper = query.upper()
        if query_upper.startswith('SELECT'):
            return 'select'
        elif query_upper.startswith('INSERT'):
            return 'insert'
        elif query_upper.startswith('UPDATE'):
            return 'update'
        elif query_upper.startswith('DELETE'):
            return 'delete'
        elif query_upper.startswith(('CREATE', 'ALTER', 'DROP')):
            return 'ddl'
        else:
            return 'other'

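# Usage sketch (illustrative; reuse a single instance, since a second
# instantiation would try to re-register the same metric names):
#
#   exporter = DatabaseExporter()
#   exporter._classify_query('SELECT * FROM users')      # -> 'select'
#   exporter._classify_query('ALTER TABLE t ADD c int')  # -> 'ddl'
#   exporter.collect_metrics()  # one pass; errors are logged, not raised

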
class CacheExporter:
    """Exporter for cache metrics."""

    def __init__(self):
        self.metrics = {
            'cache_size': Gauge(
                'cache_size_bytes',
                'Cache size in bytes',
                ['cache_type', 'tenant']
            ),
            'cache_items': Gauge(
                'cache_items_total',
                'Total items in cache',
                ['cache_type', 'tenant']
            ),
            # 'cache_operations_total' is already registered by the middleware
            # module, so this exporter needs a distinct metric name.
            'cache_operations': Counter(
                'cache_exporter_operations_total',
                'Cache operations',
                ['operation', 'cache_type', 'tenant']
            ),
            'cache_hit_ratio': Gauge(
                'cache_hit_ratio_percent',
                'Cache hit ratio percentage',
                ['cache_type', 'tenant']
            ),
            'cache_evictions': Counter(
                'cache_evictions_total',
                'Cache evictions',
                ['cache_type', 'tenant']
            ),
            'cache_memory_usage': Gauge(
                'cache_memory_usage_bytes',
                'Cache memory usage',
                ['cache_type', 'tenant']
            ),
        }

    def collect_metrics(self):
        """Collect cache metrics."""
        try:
            self._collect_redis_metrics()
            self._collect_django_cache_metrics()
        except Exception as e:
            logger.error(f"Error collecting cache metrics: {e}")

    def _collect_redis_metrics(self):
        """Collect Redis metrics."""
        try:
            redis_client = redis.Redis.from_url(settings.REDIS_URL)
            info = redis_client.info()

            # Memory usage
            self.metrics['cache_memory_usage'].labels(
                cache_type='redis',
                tenant='all'
            ).set(info['used_memory'])

            # Key count for the active database
            self.metrics['cache_items'].labels(
                cache_type='redis',
                tenant='all'
            ).set(redis_client.dbsize())

            # Hit ratio
            total = info['keyspace_hits'] + info['keyspace_misses']
            if total > 0:
                hit_ratio = (info['keyspace_hits'] / total) * 100
                self.metrics['cache_hit_ratio'].labels(
                    cache_type='redis',
                    tenant='all'
                ).set(hit_ratio)

            # Lookup operations (cumulative totals, hence the private Counter API)
            self.metrics['cache_operations'].labels(
                operation='get',
                cache_type='redis',
                tenant='all'
            )._value.set(total)

        except Exception as e:
            logger.error(f"Error collecting Redis metrics: {e}")

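    # Worked example (illustrative): keyspace_hits=900 and keyspace_misses=100
    # give a hit ratio of 900 / (900 + 100) * 100 = 90.0.
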
    def _collect_django_cache_metrics(self):
        """Collect Django cache metrics."""
        try:
            # get_stats() is backend-specific (e.g. the python-memcached
            # backend exposes it); most Django cache backends do not, in which
            # case there is nothing to report. The shape of the returned stats
            # also varies by backend.
            get_stats = getattr(cache, 'get_stats', None)
            if get_stats is None:
                return
            cache_stats = get_stats()

            for backend_name, stats in cache_stats.items():
                if 'hits' in stats and 'misses' in stats:
                    total = stats['hits'] + stats['misses']
                    if total > 0:
                        hit_ratio = (stats['hits'] / total) * 100
                        self.metrics['cache_hit_ratio'].labels(
                            cache_type='django',
                            tenant='all'
                        ).set(hit_ratio)

                        self.metrics['cache_operations'].labels(
                            operation='get',
                            cache_type='django',
                            tenant='all'
                        )._value.set(total)

        except Exception as e:
            logger.error(f"Error collecting Django cache metrics: {e}")


class SystemExporter:
    """Exporter for system metrics."""

    def __init__(self):
        self.metrics = {
            'system_cpu_usage': Gauge(
                'system_cpu_usage_percent',
                'System CPU usage percentage'
            ),
            # 'type' distinguishes physical RAM from swap
            'system_memory_usage': Gauge(
                'system_memory_usage_bytes',
                'System memory usage',
                ['type']
            ),
            'system_memory_usage_percent': Gauge(
                'system_memory_usage_percent',
                'System memory usage percentage',
                ['type']
            ),
            'system_disk_usage': Gauge(
                'system_disk_usage_bytes',
                'System disk usage',
                ['device', 'mountpoint']
            ),
            'system_disk_usage_percent': Gauge(
                'system_disk_usage_percent',
                'System disk usage percentage',
                ['device', 'mountpoint']
            ),
            'system_network_bytes': Counter(
                'system_network_bytes_total',
                'System network traffic',
                ['direction', 'interface']
            ),
            'system_load_average': Gauge(
                'system_load_average',
                'System load average',
                ['period']
            ),
            'system_uptime': Gauge(
                'system_uptime_seconds',
                'System uptime in seconds'
            ),
        }

    def collect_metrics(self):
        """Collect system metrics."""
        try:
            self._collect_cpu_metrics()
            self._collect_memory_metrics()
            self._collect_disk_metrics()
            self._collect_network_metrics()
            self._collect_load_metrics()
        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    def _collect_cpu_metrics(self):
        """Collect CPU metrics."""
        try:
            # Aggregate CPU usage; a per-core breakdown would require a gauge
            # declared with a 'cpu' label.
            cpu_percent = psutil.cpu_percent(interval=1)
            self.metrics['system_cpu_usage'].set(cpu_percent)

        except Exception as e:
            logger.error(f"Error collecting CPU metrics: {e}")

    def _collect_memory_metrics(self):
        """Collect memory metrics."""
        try:
            memory = psutil.virtual_memory()
            self.metrics['system_memory_usage'].labels(type='ram').set(memory.used)
            self.metrics['system_memory_usage_percent'].labels(type='ram').set(memory.percent)

            # Swap memory
            swap = psutil.swap_memory()
            self.metrics['system_memory_usage'].labels(type='swap').set(swap.used)
            self.metrics['system_memory_usage_percent'].labels(type='swap').set(swap.percent)

        except Exception as e:
            logger.error(f"Error collecting memory metrics: {e}")

    def _collect_disk_metrics(self):
        """Collect disk metrics."""
        try:
            disk_usage = psutil.disk_usage('/')
            self.metrics['system_disk_usage'].labels(
                device='root',
                mountpoint='/'
            ).set(disk_usage.used)
            self.metrics['system_disk_usage_percent'].labels(
                device='root',
                mountpoint='/'
            ).set((disk_usage.used / disk_usage.total) * 100)

            # Disk I/O (recorded on the shared byte counter under
            # interface='disk'; psutil reports cumulative totals)
            disk_io = psutil.disk_io_counters()
            if disk_io:
                self.metrics['system_network_bytes'].labels(
                    direction='read',
                    interface='disk'
                )._value.set(disk_io.read_bytes)
                self.metrics['system_network_bytes'].labels(
                    direction='write',
                    interface='disk'
                )._value.set(disk_io.write_bytes)

        except Exception as e:
            logger.error(f"Error collecting disk metrics: {e}")

    def _collect_network_metrics(self):
        """Collect network metrics."""
        try:
            net_io = psutil.net_io_counters()
            if net_io:
                self.metrics['system_network_bytes'].labels(
                    direction='recv',
                    interface='all'
                )._value.set(net_io.bytes_recv)
                self.metrics['system_network_bytes'].labels(
                    direction='sent',
                    interface='all'
                )._value.set(net_io.bytes_sent)

        except Exception as e:
            logger.error(f"Error collecting network metrics: {e}")

    def _collect_load_metrics(self):
        """Collect load average metrics."""
        try:
            load_avg = psutil.getloadavg()
            self.metrics['system_load_average'].labels(period='1min').set(load_avg[0])
            self.metrics['system_load_average'].labels(period='5min').set(load_avg[1])
            self.metrics['system_load_average'].labels(period='15min').set(load_avg[2])

            # System uptime
            self.metrics['system_uptime'].set(time.time() - psutil.boot_time())

        except Exception as e:
            logger.error(f"Error collecting load metrics: {e}")


class BusinessExporter:
    """Exporter for business metrics."""

    def __init__(self):
        self.metrics = {
            'active_users': Gauge(
                'business_active_users',
                'Number of active users',
                ['tenant', 'industry_type']
            ),
            'user_registrations': Counter(
                'business_user_registrations_total',
                'User registrations',
                ['tenant', 'period']
            ),
            'revenue': Counter(
                'business_revenue_myr_total',
                'Revenue in Malaysian Ringgit',
                ['tenant', 'industry_type']
            ),
            'transactions': Counter(
                'business_transactions_total',
                'Business transactions',
                ['status', 'tenant', 'payment_method']
            ),
            'tenant_resource_usage': Gauge(
                'business_tenant_resource_usage_percent',
                'Tenant resource usage percentage',
                ['tenant', 'resource_type']
            ),
            'malaysian_specific': Counter(
                'business_malaysian_operations_total',
                'Malaysian-specific operations',
                ['operation', 'state', 'tenant']
            ),
        }

    def collect_metrics(self):
        """Collect business metrics."""
        try:
            self._collect_user_metrics()
            self._collect_revenue_metrics()
            self._collect_transaction_metrics()
            self._collect_tenant_metrics()
            self._collect_malaysian_metrics()
        except Exception as e:
            logger.error(f"Error collecting business metrics: {e}")

    def _collect_user_metrics(self):
        """Collect user metrics."""
        try:
            # Active users (last 5 minutes)
            five_minutes_ago = timezone.now() - timedelta(minutes=5)
            active_count = User.objects.filter(
                last_login__gte=five_minutes_ago,
                is_active=True
            ).count()

            self.metrics['active_users'].labels(
                tenant='all',
                industry_type='all'
            ).set(active_count)

            # User registrations by period
            today = timezone.now().date()
            week_ago = today - timedelta(days=7)
            month_ago = today - timedelta(days=30)

            registrations_today = User.objects.filter(
                date_joined__date=today
            ).count()

            registrations_week = User.objects.filter(
                date_joined__date__gte=week_ago
            ).count()

            registrations_month = User.objects.filter(
                date_joined__date__gte=month_ago
            ).count()

            # Cumulative totals sampled from the database, so the counters are
            # set directly via the private prometheus_client API.
            self.metrics['user_registrations'].labels(
                tenant='all',
                period='today'
            )._value.set(registrations_today)
            self.metrics['user_registrations'].labels(
                tenant='all',
                period='week'
            )._value.set(registrations_week)
            self.metrics['user_registrations'].labels(
                tenant='all',
                period='month'
            )._value.set(registrations_month)

        except Exception as e:
            logger.error(f"Error collecting user metrics: {e}")

    def _collect_revenue_metrics(self):
        """Collect revenue metrics."""
        try:
            # This would integrate with your payment system; Transaction is
            # the project's own model.
            from core.models import Transaction

            today = timezone.now().date()
            today_revenue = Transaction.objects.filter(
                created_at__date=today,
                status='completed'
            ).aggregate(total=Sum('amount'))['total'] or 0

            self.metrics['revenue'].labels(
                tenant='all',
                industry_type='all'
            )._value.set(float(today_revenue))

        except Exception as e:
            logger.error(f"Error collecting revenue metrics: {e}")

    def _collect_transaction_metrics(self):
        """Collect transaction metrics."""
        try:
            from core.models import Transaction

            # Transaction counts by status
            status_counts = Transaction.objects.values('status').annotate(
                count=Count('id')
            )

            for item in status_counts:
                self.metrics['transactions'].labels(
                    status=item['status'],
                    tenant='all',
                    payment_method='all'
                )._value.set(item['count'])

        except Exception as e:
            logger.error(f"Error collecting transaction metrics: {e}")

    def _collect_tenant_metrics(self):
        """Collect tenant metrics."""
        try:
            tenants = TenantModel.objects.all()

            for tenant in tenants:
                # Tenant resource usage (placeholder)
                self.metrics['tenant_resource_usage'].labels(
                    tenant=tenant.name,
                    resource_type='storage'
                ).set(50)  # Placeholder value

                # Tenant active users
                active_users = User.objects.filter(
                    tenant=tenant,
                    is_active=True,
                    last_login__gte=timezone.now() - timedelta(minutes=30)
                ).count()

                self.metrics['active_users'].labels(
                    tenant=tenant.name,
                    industry_type=getattr(tenant, 'industry_type', 'general')
                ).set(active_users)

        except Exception as e:
            logger.error(f"Error collecting tenant metrics: {e}")

    def _collect_malaysian_metrics(self):
        """Collect Malaysian-specific metrics."""
        try:
            from core.models import MalaysianICValidation, SSTCalculation

            # IC validations by state
            ic_validations = MalaysianICValidation.objects.values(
                'state'
            ).annotate(count=Count('id'))

            for item in ic_validations:
                self.metrics['malaysian_specific'].labels(
                    operation='ic_validation',
                    state=item['state'],
                    tenant='all'
                )._value.set(item['count'])

            # SST calculations
            sst_calculations = SSTCalculation.objects.count()
            self.metrics['malaysian_specific'].labels(
                operation='sst_calculation',
                state='all',
                tenant='all'
            )._value.set(sst_calculations)

        except Exception as e:
            logger.error(f"Error collecting Malaysian metrics: {e}")


class MetricsCollector:
    """Main metrics collector that runs all exporters."""

    def __init__(self):
        self.exporters = {
            'database': DatabaseExporter(),
            'cache': CacheExporter(),
            'system': SystemExporter(),
            'business': BusinessExporter(),
        }
        self.running = False
        self.thread = None

    def start_collection(self, interval: int = 30):
        """Start metrics collection in background thread."""
        if not self.running:
            self.running = True
            self.thread = threading.Thread(target=self._collect_loop, args=(interval,))
            self.thread.daemon = True
            self.thread.start()
            logger.info("Metrics collection started")

    def stop_collection(self):
        """Stop metrics collection."""
        self.running = False
        if self.thread:
            self.thread.join()
        logger.info("Metrics collection stopped")

    def _collect_loop(self, interval: int):
        """Main collection loop."""
        while self.running:
            try:
                for name, exporter in self.exporters.items():
                    logger.debug(f"Collecting {name} metrics...")
                    exporter.collect_metrics()

                time.sleep(interval)
            except Exception as e:
                logger.error(f"Error in metrics collection loop: {e}")
                time.sleep(interval)

    def collect_once(self):
        """Collect metrics once (for testing)."""
        for name, exporter in self.exporters.items():
            try:
                logger.debug(f"Collecting {name} metrics...")
                exporter.collect_metrics()
            except Exception as e:
                logger.error(f"Error collecting {name} metrics: {e}")


# Global metrics collector instance
metrics_collector = MetricsCollector()
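
A wiring sketch for the collector (illustrative; the port and import path are assumptions, and `start_http_server` comes from `prometheus_client`, which this module already imports):

    from prometheus_client import start_http_server
    from monitoring.exporters import metrics_collector

    start_http_server(8001)                 # expose /metrics for Prometheus
    metrics_collector.start_collection(30)  # background loop, 30 s interval
    metrics_collector.collect_once()        # or a single pass, e.g. in tests
    metrics_collector.stop_collection()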
@@ -0,0 +1,132 @@
"""
Django management command to start metrics collection.
"""

import time
import signal

from django.core.management.base import BaseCommand
from prometheus_client import start_http_server

# Relative to the standard management/commands/ layout, the monitoring
# package is three levels up.
from ...exporters import metrics_collector
from ...alerts import alert_manager


class Command(BaseCommand):
    help = 'Start metrics collection and alert monitoring'

    def add_arguments(self, parser):
        parser.add_argument(
            '--interval',
            type=int,
            default=30,
            help='Metrics collection interval in seconds (default: 30)'
        )
        parser.add_argument(
            '--alert-interval',
            type=int,
            default=60,
            help='Alert checking interval in seconds (default: 60)'
        )
        parser.add_argument(
            '--port',
            type=int,
            default=8001,
            help='Metrics server port (default: 8001)'
        )
        parser.add_argument(
            '--metrics-only',
            action='store_true',
            help='Only collect metrics, no alerts'
        )
        parser.add_argument(
            '--alerts-only',
            action='store_true',
            help='Only check alerts, no metrics collection'
        )
        parser.add_argument(
            '--quiet',
            action='store_true',
            help='Run quietly'
        )

    def handle(self, *args, **options):
        self.interval = options['interval']
        self.alert_interval = options['alert_interval']
        self.port = options['port']
        self.metrics_only = options['metrics_only']
        self.alerts_only = options['alerts_only']
        self.quiet = options['quiet']

        # Set up signal handlers for graceful shutdown
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        self.running = True

        if not self.quiet:
            self.stdout.write(
                self.style.SUCCESS('Starting metrics collection and alert monitoring...')
            )
            self.stdout.write(f'Metrics interval: {self.interval} seconds')
            self.stdout.write(f'Alert interval: {self.alert_interval} seconds')

        try:
            # Start metrics collection and expose the /metrics endpoint on --port
            if not self.alerts_only:
                if not self.quiet:
                    self.stdout.write('Starting metrics collection...')
                start_http_server(self.port)
                metrics_collector.start_collection(self.interval)

            # Start alert monitoring
            if not self.metrics_only:
                if not self.quiet:
                    self.stdout.write('Starting alert monitoring...')
                self.start_alert_monitoring()

            # Keep the command running
            if not self.quiet:
                self.stdout.write('Monitoring started. Press Ctrl+C to stop.')

            while self.running:
                time.sleep(1)

        except KeyboardInterrupt:
            if not self.quiet:
                self.stdout.write('\nReceived interrupt signal, stopping...')
        finally:
            self.shutdown()

    def start_alert_monitoring(self):
        """Start alert monitoring in a separate thread."""
        import threading

        def alert_monitor():
            while self.running:
                try:
                    alert_manager.check_rules()
                    time.sleep(self.alert_interval)
                except Exception as e:
                    if not self.quiet:
                        self.stdout.write(
                            self.style.ERROR(f'Error in alert monitoring: {e}')
                        )
                    time.sleep(self.alert_interval)

        alert_thread = threading.Thread(target=alert_monitor, daemon=True)
        alert_thread.start()

    def signal_handler(self, signum, frame):
        """Handle shutdown signals."""
        if not self.quiet:
            self.stdout.write(f'\nReceived signal {signum}, shutting down...')
        self.running = False

    def shutdown(self):
        """Shutdown the monitoring system."""
        if not self.quiet:
            self.stdout.write('Shutting down metrics collection...')
        metrics_collector.stop_collection()

        if not self.quiet:
            self.stdout.write(self.style.SUCCESS('Monitoring stopped.'))
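
The hunk above does not show the command's file name; assuming it is saved as, say, backend/monitoring/management/commands/start_monitoring.py, typical invocations would be:

    python manage.py start_monitoring --interval 15 --alert-interval 30 --port 8001
    python manage.py start_monitoring --metrics-only --quiet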

backend/monitoring/middleware.py (new file, 512 lines)
@@ -0,0 +1,512 @@
"""
Django middleware for application monitoring and metrics collection.
"""

import time
import logging
import uuid
import platform
from typing import Dict, Any, Optional

import django
from django.conf import settings
from django.http import HttpRequest, HttpResponse
from django.contrib.auth import get_user_model
from django.db import connection
from django.utils import timezone
from django_tenants.utils import get_tenant_model
from prometheus_client import Counter, Histogram, Gauge, Info, CollectorRegistry, generate_latest

logger = logging.getLogger(__name__)
User = get_user_model()
TenantModel = get_tenant_model()

# Prometheus metrics
REQUEST_COUNT = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status', 'tenant']
)

REQUEST_DURATION = Histogram(
    'http_request_duration_seconds',
    'HTTP request duration',
    ['method', 'endpoint', 'tenant'],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0]
)

ACTIVE_USERS = Gauge(
    'active_users_total',
    'Number of active users'
)

DATABASE_CONNECTIONS = Gauge(
    'database_connections_total',
    'Database connections',
    ['state']
)

DATABASE_QUERIES = Counter(
    'database_queries_total',
    'Database queries executed',
    ['type', 'table', 'tenant']
)

CACHE_OPERATIONS = Counter(
    'cache_operations_total',
    'Cache operations',
    ['operation', 'cache_type', 'tenant']
)

CACHE_HITS = Counter(
    'cache_hits_total',
    'Cache hits',
    ['cache_type', 'tenant']
)

CACHE_MISSES = Counter(
    'cache_misses_total',
    'Cache misses',
    ['cache_type', 'tenant']
)

MALAYSIAN_OPERATIONS = Counter(
    'malaysian_operations_total',
    'Malaysian-specific operations',
    ['operation', 'type']
)

TENANT_METRICS = Gauge(
    'tenant_metrics',
    'Tenant-specific metrics',
    ['tenant_id', 'metric_type']
)

AUTH_EVENTS = Counter(
    'auth_events_total',
    'Authentication events',
    ['event_type', 'result', 'tenant']
)

ERROR_EVENTS = Counter(
    'error_events_total',
    'Application errors',
    ['error_type', 'severity', 'tenant']
)

BUSINESS_METRICS = Counter(
    'business_events_total',
    'Business events',
    ['event_type', 'tenant']
)

SLO_METRICS = Histogram(
    'slo_metrics',
    'Service Level Objective metrics',
    ['slo_name', 'tenant']
)

# Application info
APP_INFO = Info('application_info', 'Application information')
APP_INFO.info({
    'version': getattr(settings, 'VERSION', '1.0.0'),
    'environment': getattr(settings, 'ENVIRONMENT', 'development'),
    'django_version': django.get_version(),
    'python_version': platform.python_version(),
    'malaysian_sme_platform': 'true'
})


class MonitoringMiddleware:
    """Middleware for comprehensive application monitoring."""

    def __init__(self, get_response):
        self.get_response = get_response
        self.registry = CollectorRegistry()

    def __call__(self, request: HttpRequest) -> HttpResponse:
        # Generate request ID for tracing
        request_id = str(uuid.uuid4())
        request.request_id = request_id

        # Start timing
        start_time = time.time()

        # Get tenant info
        tenant_info = self._get_tenant_info(request)

        # Log request start
        self._log_request_start(request, tenant_info)

        # Execute request
        response = self.get_response(request)

        # Calculate metrics
        duration = time.time() - start_time
        endpoint = self._get_endpoint(request)

        # Record metrics
        self._record_request_metrics(request, response, duration, endpoint, tenant_info)
        self._record_business_metrics(request, response, tenant_info)
        self._record_slo_metrics(request, response, duration, tenant_info)

        # Add monitoring headers
        self._add_monitoring_headers(response, request_id, duration)

        return response

    def process_exception(self, request: HttpRequest, exception: Exception) -> Optional[HttpResponse]:
        """Process exceptions and record error metrics."""
        tenant_info = self._get_tenant_info(request)

        ERROR_EVENTS.labels(
            error_type=exception.__class__.__name__,
            severity='error',
            tenant=tenant_info.get('name', 'unknown')
        ).inc()

        logger.error(f"Request {request.request_id} failed: {exception}", exc_info=True)
        return None

    def process_template_response(self, request: HttpRequest, response: HttpResponse) -> HttpResponse:
        """Process template responses for additional metrics."""
        # Add performance metrics to template context
        if hasattr(request, 'tenant'):
            response.context_data = response.context_data or {}
            response.context_data.update({
                'performance_metrics': self._get_performance_metrics(request),
                'tenant_metrics': self._get_tenant_metrics(request),
            })
        return response

    def _get_tenant_info(self, request: HttpRequest) -> Dict[str, Any]:
        """Get tenant information from request."""
        tenant_info = {'id': None, 'name': 'public', 'schema': 'public'}

        if hasattr(request, 'tenant') and request.tenant:
            tenant_info = {
                'id': request.tenant.id,
                'name': request.tenant.name,
                'schema': request.tenant.schema_name,
                'domain': getattr(request.tenant, 'domain_url', ''),
                'industry_type': getattr(request.tenant, 'industry_type', 'general')
            }

        return tenant_info

    def _log_request_start(self, request: HttpRequest, tenant_info: Dict[str, Any]):
        """Log request start information."""
        # request.user is absent when this middleware runs before
        # AuthenticationMiddleware, so fall back to 'anonymous'.
        user = getattr(request, 'user', None)
        username = user if user is not None and user.is_authenticated else 'anonymous'
        logger.info(
            f"Request started: {request.method} {request.path} "
            f"(ID: {request.request_id}, Tenant: {tenant_info['name']}, "
            f"User: {username})"
        )

    def _get_endpoint(self, request: HttpRequest) -> str:
        """Extract endpoint name from request."""
        try:
            if hasattr(request.resolver_match, 'view_name'):
                return request.resolver_match.view_name or request.path
            return request.path
        except AttributeError:
            return request.path

    def _record_request_metrics(self, request: HttpRequest, response: HttpResponse,
                                duration: float, endpoint: str, tenant_info: Dict[str, Any]):
        """Record HTTP request metrics."""
        tenant_name = tenant_info.get('name', 'unknown')

        # Record request count and duration
        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=endpoint,
            status=response.status_code,
            tenant=tenant_name
        ).inc()

        REQUEST_DURATION.labels(
            method=request.method,
            endpoint=endpoint,
            tenant=tenant_name
        ).observe(duration)

        # Record database queries if available (connection.queries is only
        # populated when DEBUG is enabled)
        if hasattr(connection, 'queries') and connection.queries:
            for query in connection.queries:
                DATABASE_QUERIES.labels(
                    type='select' if query['sql'].upper().startswith('SELECT') else 'other',
                    table=self._extract_table_name(query['sql']),
                    tenant=tenant_name
                ).inc()

        # Log slow requests
        if duration > 2.0:  # 2 seconds threshold
            logger.warning(
                f"Slow request detected: {request.method} {request.path} "
                f"(Duration: {duration:.2f}s, Tenant: {tenant_name})"
            )

    def _record_business_metrics(self, request: HttpRequest, response: HttpResponse,
                                 tenant_info: Dict[str, Any]):
        """Record business-specific metrics."""
        tenant_name = tenant_info.get('name', 'unknown')

        # Track user activity
        if request.user.is_authenticated:
            BUSINESS_METRICS.labels(
                event_type='user_activity',
                tenant=tenant_name
            ).inc()

        # Track Malaysian-specific operations
        if self._is_malaysian_endpoint(request.path):
            BUSINESS_METRICS.labels(
                event_type='malaysian_operation',
                tenant=tenant_name
            ).inc()

        # Track API calls
        if request.path.startswith('/api/'):
            BUSINESS_METRICS.labels(
                event_type='api_call',
                tenant=tenant_name
            ).inc()
    def _record_slo_metrics(self, request: HttpRequest, response: HttpResponse,
                            duration: float, tenant_info: Dict[str, Any]):
        """Record Service Level Objective metrics."""
        tenant_name = tenant_info.get('name', 'unknown')

        # Availability SLO: any non-5xx response counts as available
        SLO_METRICS.labels(
            slo_name='availability',
            tenant=tenant_name
        ).observe(1.0 if response.status_code < 500 else 0.0)

        # Latency SLO: full credit at or under 2s, then a linear decay
        # that reaches 0.0 at 10s
        latency_slo_value = 1.0 if duration <= 2.0 else max(0.0, 1.0 - (duration - 2.0) / 8.0)
        SLO_METRICS.labels(
            slo_name='latency',
            tenant=tenant_name
        ).observe(latency_slo_value)
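    # Worked example for the latency SLO above: a 4.0s request scores
    # 1.0 - (4.0 - 2.0) / 8.0 = 0.75; anything at or under 2.0s scores
    # 1.0, and 10.0s or slower scores 0.0.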
    def _add_monitoring_headers(self, response: HttpResponse, request_id: str, duration: float):
        """Add monitoring headers to the response."""
        response['X-Request-ID'] = request_id
        response['X-Response-Time'] = f"{duration:.3f}s"
        response['X-Monitoring-Timestamp'] = timezone.now().isoformat()
    def _extract_table_name(self, sql: str) -> str:
        """Extract the table name from a SQL query."""
        try:
            sql_upper = sql.upper()
            if 'FROM ' in sql_upper:
                from_part = sql_upper.split('FROM ')[1]
                table_name = from_part.split()[0].strip('"[]')
                return table_name
        except Exception:
            pass
        return 'unknown'
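    # Example: _extract_table_name('SELECT * FROM "orders" WHERE id = 1')
    # returns 'ORDERS' -- the query is upper-cased before splitting, so
    # table labels in DATABASE_QUERIES come out upper-cased as well.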
    def _is_malaysian_endpoint(self, path: str) -> bool:
        """Check whether the endpoint is Malaysian-specific."""
        malaysian_endpoints = [
            '/api/malaysian/',
            '/api/sst/',
            '/api/ic-validation/',
            '/api/postcode/',
        ]
        return any(path.startswith(endpoint) for endpoint in malaysian_endpoints)
    def _get_performance_metrics(self, request: HttpRequest) -> Dict[str, Any]:
        """Get performance metrics for the template context."""
        return {
            'response_time': getattr(request, 'response_time', 0),
            'database_queries': len(getattr(connection, 'queries', [])),
            'cache_hits': getattr(request, 'cache_hits', 0),
            'cache_misses': getattr(request, 'cache_misses', 0),
        }

    def _get_tenant_metrics(self, request: HttpRequest) -> Dict[str, Any]:
        """Get tenant metrics for the template context."""
        if hasattr(request, 'tenant') and request.tenant:
            return {
                'tenant_name': request.tenant.name,
                'tenant_users': request.tenant.users.count(),
                'tenant_industry': getattr(request.tenant, 'industry_type', 'general'),
                'tenant_domain': getattr(request.tenant, 'domain_url', ''),
            }
        return {}

class DatabaseMonitoringMiddleware:
    """Middleware for database monitoring."""

    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request: HttpRequest) -> HttpResponse:
        # Snapshot the query count before the view runs. As above,
        # connection.queries is only populated when DEBUG is True.
        initial_queries = len(getattr(connection, 'queries', []))

        response = self.get_response(request)

        # Calculate query metrics
        final_queries = len(getattr(connection, 'queries', []))
        queries_executed = final_queries - initial_queries

        # Update metrics
        if hasattr(request, 'tenant'):
            tenant_name = request.tenant.name
            DATABASE_QUERIES.labels(
                type='total',
                table='all',
                tenant=tenant_name
            ).inc(queries_executed)

        return response

class CacheMonitoringMiddleware:
    """Middleware for cache monitoring."""

    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request: HttpRequest) -> HttpResponse:
        # Initialize per-request cache counters
        request.cache_hits = 0
        request.cache_misses = 0

        response = self.get_response(request)

        # Record cache metrics
        if hasattr(request, 'tenant'):
            tenant_name = request.tenant.name
            CACHE_HITS.labels(
                cache_type='django',
                tenant=tenant_name
            ).inc(request.cache_hits)

            CACHE_MISSES.labels(
                cache_type='django',
                tenant=tenant_name
            ).inc(request.cache_misses)

        return response
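# Note on the counters above: this middleware initializes and reports
# request.cache_hits / request.cache_misses, but nothing in it increments
# them. Whatever code performs the cache lookups is expected to bump the
# attributes, along these lines:
#
#     value = cache.get(key)
#     if value is None:
#         request.cache_misses += 1
#     else:
#         request.cache_hits += 1
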
class SecurityMonitoringMiddleware:
    """Middleware for security monitoring."""

    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request: HttpRequest) -> HttpResponse:
        # Security checks before the request is handled
        self._check_security_headers(request)
        self._check_rate_limits(request)

        response = self.get_response(request)

        # Security monitoring after the response
        self._monitor_security_events(request, response)

        return response
    def _check_security_headers(self, request: HttpRequest):
        """Check for security-related headers."""
        # Flag suspicious user agents. Note that curl and wget are common
        # in legitimate automation, so expect false positives from this
        # list in practice.
        user_agent = request.META.get('HTTP_USER_AGENT', '')
        suspicious_agents = ['sqlmap', 'nikto', 'nmap', 'curl', 'wget']
        if any(agent in user_agent.lower() for agent in suspicious_agents):
            ERROR_EVENTS.labels(
                error_type='suspicious_user_agent',
                severity='warning',
                tenant='unknown'
            ).inc()
    def _check_rate_limits(self, request: HttpRequest):
        """Check rate limits."""
        ip_address = self._get_client_ip(request)
        # Rate limiting is not implemented yet. A production version would
        # typically track per-IP request rates in Redis; see the sketch
        # below for one way to use ip_address for that.
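    # --- Illustrative sketch, not part of the original commit ---
    # A minimal fixed-window limiter for the stub above. The key scheme,
    # the 100-requests-per-minute default, and the REDIS_URL setting are
    # assumptions for illustration, not project conventions.
    def _is_rate_limited(self, ip_address: str, limit: int = 100,
                         window_seconds: int = 60) -> bool:
        """Return True when ip_address exceeded the per-window limit.

        Best-effort: fails open when Redis is unavailable.
        """
        try:
            import redis
            from django.conf import settings

            client = redis.from_url(settings.REDIS_URL)
            key = f"rate_limit:{ip_address}"
            count = client.incr(key)  # atomic per-request increment
            if count == 1:
                # First hit in this window: start the expiry clock
                client.expire(key, window_seconds)
            return count > limit
        except Exception:
            return False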
    def _monitor_security_events(self, request: HttpRequest, response: HttpResponse):
        """Monitor security-related events."""
        # request.tenant is an object, not a dict, so read its name with
        # getattr rather than .get()
        tenant_name = getattr(getattr(request, 'tenant', None), 'name', 'unknown')

        # Monitor authentication attempts. Treating HTTP 200 as success
        # is an approximation: a successful login often redirects (302),
        # so adjust this to match the project's login flow.
        if '/login' in request.path:
            result = 'success' if response.status_code == 200 else 'failed'
            AUTH_EVENTS.labels(
                event_type='login_attempt',
                result=result,
                tenant=tenant_name
            ).inc()

        # Monitor for SQL injection attempts
        if self._detect_sql_injection(request):
            ERROR_EVENTS.labels(
                error_type='sql_injection_attempt',
                severity='critical',
                tenant=tenant_name
            ).inc()
    def _get_client_ip(self, request: HttpRequest) -> str:
        """Get the client IP address, honouring X-Forwarded-For."""
        x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR')
        if x_forwarded_for:
            # The first entry is the original client when behind proxies
            ip = x_forwarded_for.split(',')[0].strip()
        else:
            ip = request.META.get('REMOTE_ADDR', '')
        return ip
    def _detect_sql_injection(self, request: HttpRequest) -> bool:
        """Detect potential SQL injection attempts.

        A naive substring check: patterns such as "--" also occur in
        legitimate input, so treat a hit as a signal to investigate,
        not proof of an attack.
        """
        sql_injection_patterns = [
            "' OR '1'='1",
            "DROP TABLE",
            "UNION SELECT",
            "INSERT INTO",
            "UPDATE SET",
            "DELETE FROM",
            "--",
            "/*",
            "*/",
        ]

        # Check GET parameters
        for value in request.GET.values():
            if any(pattern.upper() in value.upper() for pattern in sql_injection_patterns):
                return True

        # Check POST data
        if hasattr(request, 'POST'):
            for value in request.POST.values():
                if any(pattern.upper() in value.upper() for pattern in sql_injection_patterns):
                    return True

        return False
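    # --- Illustrative alternative, not part of the original commit ---
    # A regex variant of the check above: word boundaries and an anchored
    # quote pattern avoid flagging an innocent "--" in free text. Still a
    # heuristic; parameterized queries remain the real defence.
    @staticmethod
    def _looks_like_sql_injection(value: str) -> bool:
        import re
        pattern = re.compile(
            r"(\bUNION\s+SELECT\b|\bDROP\s+TABLE\b|'\s*OR\s*'1'\s*=\s*'1)",
            re.IGNORECASE,
        )
        return bool(pattern.search(value))
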
class MetricsView:
    """Helper for exposing Prometheus metrics."""

    def __init__(self):
        # Use the default registry: the module-level metrics register
        # themselves there, so a fresh CollectorRegistry() would expose
        # nothing (assumes REGISTRY is imported from prometheus_client
        # alongside generate_latest).
        self.registry = REGISTRY

    def get_metrics(self) -> str:
        """Get all metrics in Prometheus text format."""
        return generate_latest(self.registry).decode('utf-8')

    def get_health_metrics(self) -> Dict[str, Any]:
        """Get health metrics.

        Note: reading metric values via the private _value attribute is
        not a public prometheus_client API, but the library offers no
        official getter.
        """
        return {
            'application_info': getattr(APP_INFO, '_value', {}),
            'active_users': ACTIVE_USERS._value.get(),
            'database_connections': DATABASE_CONNECTIONS._value.get(),
            'timestamp': timezone.now().isoformat(),
        }
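# --- Illustrative wiring sketch, not part of the original commit ---
# One plausible MIDDLEWARE ordering for the classes above; the module
# path 'monitoring.middleware' and the surrounding entries are
# assumptions, not this project's actual settings.
#
#     MIDDLEWARE = [
#         'django.middleware.security.SecurityMiddleware',
#         'monitoring.middleware.SecurityMonitoringMiddleware',
#         'monitoring.middleware.DatabaseMonitoringMiddleware',
#         'monitoring.middleware.CacheMonitoringMiddleware',
#         # ... the rest of the stack ...
#     ]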
481
backend/monitoring/views.py
Normal file
@@ -0,0 +1,481 @@
"""
|
||||
Django views for monitoring and metrics endpoints.
|
||||
"""
|
||||
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional
|
||||
from django.http import JsonResponse, HttpResponse
|
||||
from django.views.generic import TemplateView
|
||||
from django.contrib.auth.mixins import LoginRequiredMixin
|
||||
from django.db import connection
|
||||
from django.utils import timezone
|
||||
from django.conf import settings
|
||||
from rest_framework.views import APIView
|
||||
from rest_framework.response import Response
|
||||
from rest_framework import status
|
||||
from prometheus_client import generate_latest, REGISTRY, CONTENT_TYPE_LATEST
|
||||
from prometheus_client.parser import text_string_to_metric_families
|
||||
|
||||
from .middleware import MetricsView
|
||||
from .exporters import metrics_collector
|
||||
from .alerts import alert_manager, Alert, AlertSeverity, AlertCategory
|
||||
|
||||
class MetricsView(APIView):
    """View for the Prometheus metrics endpoint."""

    def get(self, request):
        """Get Prometheus metrics."""
        try:
            # Refresh collector-driven metrics before export
            metrics_collector.collect_once()

            # Render the default registry in Prometheus text format
            metrics_data = generate_latest(REGISTRY)
            return HttpResponse(
                metrics_data,
                content_type=CONTENT_TYPE_LATEST
            )
        except Exception as e:
            return JsonResponse(
                {'error': f'Failed to generate metrics: {str(e)}'},
                status=status.HTTP_500_INTERNAL_SERVER_ERROR
            )

class HealthCheckView(APIView):
    """Health check endpoint."""

    def get(self, request):
        """Comprehensive health check."""
        try:
            health_status = {
                'status': 'healthy',
                'timestamp': timezone.now().isoformat(),
                'version': getattr(settings, 'VERSION', '1.0.0'),
                'environment': getattr(settings, 'ENVIRONMENT', 'development'),
                'checks': {}
            }

            # Database health (timed once, rather than running the probe
            # query and then timing a second, redundant probe)
            try:
                db_response_time = self._measure_response_time(self._check_database)
                health_status['checks']['database'] = {
                    'status': 'healthy',
                    'response_time': db_response_time
                }
            except Exception as e:
                health_status['checks']['database'] = {
                    'status': 'unhealthy',
                    'error': str(e)
                }
                health_status['status'] = 'degraded'

            # Cache health
            try:
                cache_response_time = self._measure_response_time(self._check_cache)
                health_status['checks']['cache'] = {
                    'status': 'healthy',
                    'response_time': cache_response_time
                }
            except Exception as e:
                health_status['checks']['cache'] = {
                    'status': 'unhealthy',
                    'error': str(e)
                }
                health_status['status'] = 'degraded'

            # Storage health
            try:
                storage_health = self._check_storage()
                health_status['checks']['storage'] = storage_health
                if storage_health['status'] != 'healthy':
                    health_status['status'] = 'degraded'
            except Exception as e:
                health_status['checks']['storage'] = {
                    'status': 'unhealthy',
                    'error': str(e)
                }
                health_status['status'] = 'degraded'

            # External services health
            external_services = self._check_external_services()
            health_status['checks']['external_services'] = external_services
            if any(service['status'] != 'healthy' for service in external_services.values()):
                health_status['status'] = 'degraded'

            # Malaysian services health
            malaysian_services = self._check_malaysian_services()
            health_status['checks']['malaysian_services'] = malaysian_services
            if any(service['status'] != 'healthy' for service in malaysian_services.values()):
                health_status['status'] = 'degraded'

            # Map the overall status to an HTTP code. Degraded also maps
            # to 503 so that load balancers stop routing to the instance.
            if health_status['status'] == 'healthy':
                http_status = status.HTTP_200_OK
            elif health_status['status'] == 'degraded':
                http_status = status.HTTP_503_SERVICE_UNAVAILABLE
            else:
                http_status = status.HTTP_500_INTERNAL_SERVER_ERROR

            return Response(health_status, status=http_status)

        except Exception as e:
            return Response(
                {'status': 'unhealthy', 'error': str(e)},
                status=status.HTTP_500_INTERNAL_SERVER_ERROR
            )
    def _measure_response_time(self, func) -> float:
        """Measure the wall-clock time a function takes, in seconds."""
        import time
        start_time = time.time()
        func()
        return time.time() - start_time

    def _check_database(self):
        """Check database connectivity."""
        with connection.cursor() as cursor:
            cursor.execute("SELECT 1")
            cursor.fetchone()

    def _check_cache(self):
        """Check cache functionality with a write-then-read round trip."""
        from django.core.cache import cache
        cache.set('health_check', 'test', 1)
        result = cache.get('health_check')
        if result != 'test':
            raise Exception("Cache functionality failed")
    def _check_storage(self) -> Dict[str, Any]:
        """Check storage availability."""
        try:
            import os

            # Check the media directory
            media_path = getattr(settings, 'MEDIA_ROOT', '/media')
            if not os.path.exists(media_path):
                return {'status': 'unhealthy', 'error': 'Media directory not found'}

            # Check write permissions
            test_file = os.path.join(media_path, 'health_check_test.tmp')
            try:
                with open(test_file, 'w') as f:
                    f.write('test')
                os.remove(test_file)
            except Exception as e:
                return {'status': 'unhealthy', 'error': f'Write permission error: {str(e)}'}

            # Check disk space. os.statvfs is POSIX-only, so this check
            # will raise (and report unhealthy) on Windows hosts.
            disk_usage = os.statvfs(media_path)
            free_space_percent = (disk_usage.f_bavail * disk_usage.f_frsize) / (disk_usage.f_blocks * disk_usage.f_frsize) * 100

            if free_space_percent < 10:
                return {
                    'status': 'degraded',
                    'error': f'Low disk space: {free_space_percent:.1f}% free'
                }

            return {'status': 'healthy', 'free_space_percent': free_space_percent}

        except Exception as e:
            return {'status': 'unhealthy', 'error': str(e)}
    def _check_external_services(self) -> Dict[str, Dict[str, Any]]:
        """Check the health of external services."""
        services = {}

        # Check the email backend. The local name email_connection avoids
        # shadowing the django.db connection imported at module level.
        try:
            from django.core.mail import get_connection
            email_connection = get_connection()
            email_connection.open()
            email_connection.close()
            services['email'] = {'status': 'healthy'}
        except Exception as e:
            services['email'] = {'status': 'unhealthy', 'error': str(e)}

        # Check Redis (requires a REDIS_URL setting; the except clause
        # covers the case where it is not configured)
        try:
            import redis
            redis_client = redis.from_url(settings.REDIS_URL)
            redis_client.ping()
            services['redis'] = {'status': 'healthy'}
        except Exception as e:
            services['redis'] = {'status': 'unhealthy', 'error': str(e)}

        # Check external APIs (if configured)
        external_apis = getattr(settings, 'EXTERNAL_APIS', {})
        for api_name, api_config in external_apis.items():
            try:
                import requests
                response = requests.get(
                    api_config['health_url'],
                    timeout=api_config.get('timeout', 5)
                )
                if response.status_code == 200:
                    services[api_name] = {'status': 'healthy'}
                else:
                    services[api_name] = {
                        'status': 'unhealthy',
                        'error': f'HTTP {response.status_code}'
                    }
            except Exception as e:
                services[api_name] = {'status': 'unhealthy', 'error': str(e)}

        return services
    def _check_malaysian_services(self) -> Dict[str, Dict[str, Any]]:
        """Check Malaysian-specific services."""
        services = {}

        # Check the Malaysian postcode service
        try:
            from core.services.malaysian_services import MalaysianPostcodeService
            postcode_service = MalaysianPostcodeService()
            result = postcode_service.lookup_postcode('50000')
            services['postcode_service'] = {
                'status': 'healthy' if result else 'degraded'
            }
        except Exception as e:
            services['postcode_service'] = {'status': 'unhealthy', 'error': str(e)}

        # Check the SST calculation service
        try:
            from core.services.malaysian_services import SSTCalculationService
            sst_service = SSTCalculationService()
            result = sst_service.calculate_sst(100, 'standard', 'Johor')
            services['sst_service'] = {
                'status': 'healthy' if result is not None else 'degraded'
            }
        except Exception as e:
            services['sst_service'] = {'status': 'unhealthy', 'error': str(e)}

        # Check the IC validation service
        try:
            from core.services.malaysian_services import ICValidationService
            ic_service = ICValidationService()
            result = ic_service.validate_ic('1234567890')
            services['ic_validation_service'] = {
                'status': 'healthy' if result is not None else 'degraded'
            }
        except Exception as e:
            services['ic_validation_service'] = {'status': 'unhealthy', 'error': str(e)}

        return services

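# --- Illustrative usage, not part of the original commit ---
# Probing the health check from a script; the URL path is an assumption
# about how these views are routed, not taken from this commit.
#
#     import requests
#     resp = requests.get('http://localhost:8000/monitoring/health/', timeout=5)
#     body = resp.json()
#     print(resp.status_code, body['status'], sorted(body['checks']))
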
class AlertsView(APIView):
    """Alerts management endpoint."""

    def get(self, request):
        """Get alerts."""
        try:
            # Check for new alerts
            alert_manager.check_rules()

            # Get query parameters. The local name alert_status avoids
            # shadowing the rest_framework status module used below.
            severity = request.query_params.get('severity')
            category = request.query_params.get('category')
            alert_status = request.query_params.get('status', 'active')
            hours = int(request.query_params.get('hours', 24))

            # Get alerts
            if alert_status == 'active':
                alerts = alert_manager.get_active_alerts(severity=severity, category=category)
            else:
                alerts = alert_manager.get_alert_history(hours=hours)
                if severity:
                    alerts = [a for a in alerts if a.severity == severity]
                if category:
                    alerts = [a for a in alerts if a.category == category]

            # Convert to the response format
            response_data = {
                'alerts': [alert.to_dict() for alert in alerts],
                'summary': self._get_alerts_summary(alerts),
                'timestamp': timezone.now().isoformat()
            }

            return Response(response_data)

        except Exception as e:
            return Response(
                {'error': f'Failed to get alerts: {str(e)}'},
                status=status.HTTP_500_INTERNAL_SERVER_ERROR
            )
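    # Example query against this endpoint (the /monitoring/alerts/ prefix
    # is an assumption about routing; the parameter names come from get()
    # above):
    #
    #     GET /monitoring/alerts/?status=history&hours=6&severity=critical
    #
    # returns the last six hours of critical alerts plus the summary block.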
    def post(self, request):
        """Create a manual alert."""
        try:
            data = request.data

            alert = Alert(
                title=data['title'],
                description=data['description'],
                severity=data.get('severity', AlertSeverity.INFO),
                category=data.get('category', AlertCategory.SYSTEM),
                metadata=data.get('metadata', {})
            )

            alert_manager.trigger_alert(alert)

            return Response(
                {'message': 'Alert created successfully', 'alert_id': alert.id},
                status=status.HTTP_201_CREATED
            )

        except KeyError as e:
            # A missing required field is a client error, not a server error
            return Response(
                {'error': f'Missing required field: {str(e)}'},
                status=status.HTTP_400_BAD_REQUEST
            )
        except Exception as e:
            return Response(
                {'error': f'Failed to create alert: {str(e)}'},
                status=status.HTTP_500_INTERNAL_SERVER_ERROR
            )
    def _get_alerts_summary(self, alerts) -> Dict[str, Any]:
        """Get alerts summary statistics."""
        summary = {
            'total': len(alerts),
            'by_severity': {},
            'by_category': {},
            'by_status': {}
        }

        for alert in alerts:
            # Count by severity
            summary['by_severity'][alert.severity] = summary['by_severity'].get(alert.severity, 0) + 1

            # Count by category
            summary['by_category'][alert.category] = summary['by_category'].get(alert.category, 0) + 1

            # Count by status (alert_status avoids shadowing the DRF
            # status module imported above)
            alert_status = alert.get_status()
            summary['by_status'][alert_status] = summary['by_status'].get(alert_status, 0) + 1

        return summary

class AlertActionView(APIView):
    """Alert management actions."""

    def post(self, request, alert_id: str, action: str):
        """Perform alert actions."""
        try:
            user = request.user.username if request.user.is_authenticated else 'api_user'

            if action == 'acknowledge':
                alert_manager.acknowledge_alert(alert_id, user)
                return Response({'message': f'Alert {alert_id} acknowledged'})

            elif action == 'resolve':
                alert_manager.resolve_alert(alert_id, user)
                return Response({'message': f'Alert {alert_id} resolved'})

            else:
                return Response(
                    {'error': f'Unknown action: {action}'},
                    status=status.HTTP_400_BAD_REQUEST
                )

        except Exception as e:
            return Response(
                {'error': f'Failed to perform action {action} on alert {alert_id}: {str(e)}'},
                status=status.HTTP_500_INTERNAL_SERVER_ERROR
            )

class MonitoringDashboardView(LoginRequiredMixin, TemplateView):
    """Monitoring dashboard template view."""

    template_name = 'monitoring/dashboard.html'

    def get_context_data(self, **kwargs):
        """Get dashboard context data."""
        context = super().get_context_data(**kwargs)

        # Current alerts
        context['active_alerts'] = alert_manager.get_active_alerts()
        context['alert_summary'] = self._get_alerts_summary(context['active_alerts'])

        # System metrics
        context['system_metrics'] = self._get_system_metrics()

        # Business metrics
        context['business_metrics'] = self._get_business_metrics()

        # Malaysian-specific metrics
        context['malaysian_metrics'] = self._get_malaysian_metrics()

        return context
    def _get_alerts_summary(self, alerts) -> Dict[str, Any]:
        """Get an alerts summary broken down by severity."""
        return {
            'total': len(alerts),
            'critical': len([a for a in alerts if a.severity == AlertSeverity.CRITICAL]),
            'error': len([a for a in alerts if a.severity == AlertSeverity.ERROR]),
            'warning': len([a for a in alerts if a.severity == AlertSeverity.WARNING]),
            'info': len([a for a in alerts if a.severity == AlertSeverity.INFO]),
        }
    def _get_system_metrics(self) -> Dict[str, Any]:
        """Get system metrics via psutil."""
        try:
            import psutil

            return {
                # cpu_percent(interval=1) blocks the request for a second;
                # interval=None would return instantly from cached counters
                'cpu_usage': psutil.cpu_percent(interval=1),
                'memory_usage': psutil.virtual_memory().percent,
                'disk_usage': psutil.disk_usage('/').percent,
                'load_average': psutil.getloadavg()[0],
                'uptime': datetime.now() - datetime.fromtimestamp(psutil.boot_time()),
            }
        except Exception:
            return {}
    def _get_business_metrics(self) -> Dict[str, Any]:
        """Get business metrics."""
        try:
            from django.contrib.auth import get_user_model
            from core.models import Transaction

            User = get_user_model()

            # Users active within the last 30 minutes
            active_users = User.objects.filter(
                is_active=True,
                last_login__gte=timezone.now() - timedelta(minutes=30)
            ).count()

            # Today's completed transactions
            today_transactions = Transaction.objects.filter(
                created_at__date=timezone.now().date(),
                status='completed'
            ).count()

            return {
                'active_users': active_users,
                'today_transactions': today_transactions,
            }
        except Exception:
            return {}
    def _get_malaysian_metrics(self) -> Dict[str, Any]:
        """Get Malaysian-specific metrics."""
        try:
            from core.models import MalaysianICValidation, SSTCalculation

            return {
                'ic_validations_today': MalaysianICValidation.objects.filter(
                    created_at__date=timezone.now().date()
                ).count(),
                'sst_calculations_today': SSTCalculation.objects.filter(
                    created_at__date=timezone.now().date()
                ).count(),
            }
        except Exception:
            return {}

class MetricsDashboardView(LoginRequiredMixin, TemplateView):
    """Metrics dashboard template view."""

    template_name = 'monitoring/metrics_dashboard.html'
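
# --- Illustrative wiring sketch, not part of the original commit ---
# One plausible backend/monitoring/urls.py for the views above; route
# names and path prefixes are assumptions, not the project's actual
# configuration.
#
#     from django.urls import path
#     from .views import (MetricsView, HealthCheckView, AlertsView,
#                         AlertActionView, MonitoringDashboardView,
#                         MetricsDashboardView)
#
#     urlpatterns = [
#         path('metrics/', MetricsView.as_view(), name='metrics'),
#         path('health/', HealthCheckView.as_view(), name='health'),
#         path('alerts/', AlertsView.as_view(), name='alerts'),
#         path('alerts/<str:alert_id>/<str:action>/',
#              AlertActionView.as_view(), name='alert-action'),
#         path('dashboard/', MonitoringDashboardView.as_view(),
#              name='dashboard'),
#         path('metrics-dashboard/', MetricsDashboardView.as_view(),
#              name='metrics-dashboard'),
#     ]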