Files
multitenetsaas/backend/monitoring/alerts.py
AHMET YILMAZ b3fff546e9
Some checks failed
System Monitoring / Health Checks (push) Has been cancelled
System Monitoring / Performance Monitoring (push) Has been cancelled
System Monitoring / Database Monitoring (push) Has been cancelled
System Monitoring / Cache Monitoring (push) Has been cancelled
System Monitoring / Log Monitoring (push) Has been cancelled
System Monitoring / Resource Monitoring (push) Has been cancelled
System Monitoring / Uptime Monitoring (push) Has been cancelled
System Monitoring / Backup Monitoring (push) Has been cancelled
System Monitoring / Security Monitoring (push) Has been cancelled
System Monitoring / Monitoring Dashboard (push) Has been cancelled
System Monitoring / Alerting (push) Has been cancelled
Security Scanning / Dependency Scanning (push) Has been cancelled
Security Scanning / Code Security Scanning (push) Has been cancelled
Security Scanning / Secrets Scanning (push) Has been cancelled
Security Scanning / Container Security Scanning (push) Has been cancelled
Security Scanning / Compliance Checking (push) Has been cancelled
Security Scanning / Security Dashboard (push) Has been cancelled
Security Scanning / Security Remediation (push) Has been cancelled
project initialization
2025-10-05 02:37:33 +08:00

584 lines
20 KiB
Python

"""
Alert management system for the Malaysian SME SaaS platform.
Provides comprehensive alerting with Malaysian context.
"""
import json
import logging
import smtplib
import requests
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Callable
from email.mime.text import MimeText
from email.mime.multipart import MimeMultipart
from django.conf import settings
from django.core.mail import send_mail
from django.utils import timezone
from django.db import connection
from prometheus_client import Counter, Gauge
import redis
logger = logging.getLogger(__name__)
# Alert metrics
ALERTS_TOTAL = Counter('alerts_total', 'Total alerts generated', ['severity', 'category'])
ALERTS_RESOLVED = Counter('alerts_resolved_total', 'Total alerts resolved', ['severity', 'category'])
ALERTS_ACTIVE = Gauge('alerts_active', 'Currently active alerts', ['severity', 'category'])
class AlertSeverity:
    """String constants for alert severity, least to most urgent."""

    INFO = 'info'          # informational only, no action expected
    WARNING = 'warning'    # needs attention, not yet service-impacting
    ERROR = 'error'        # something is broken
    CRITICAL = 'critical'  # immediate intervention required
class AlertCategory:
    """String constants classifying what subsystem an alert concerns."""

    SYSTEM = 'system'            # host-level resources (CPU, memory, ...)
    APPLICATION = 'application'  # application-layer behaviour
    DATABASE = 'database'        # database connectivity/health
    CACHE = 'cache'              # cache layer
    SECURITY = 'security'        # auth / intrusion related
    BUSINESS = 'business'        # business KPIs
    MALAYSIAN = 'malaysian'      # Malaysian-market-specific services
class Alert:
    """One alert occurrence.

    Tracks the alert's lifecycle: created (active), optionally
    acknowledged by an operator, and finally resolved. Convenience
    fields (tags/source/tenant) are lifted out of the metadata payload.
    """

    def __init__(
        self,
        title: str,
        description: str,
        severity: str,
        category: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        # Id combines the creation timestamp with a hash of the title.
        self.id = f"{int(timezone.now().timestamp())}-{hash(title)}"
        self.title = title
        self.description = description
        self.severity = severity
        self.category = category
        self.metadata = metadata if metadata is not None else {}
        self.created_at = timezone.now()
        self.resolved_at = None
        self.acknowledged_at = None
        self.acknowledged_by = None
        # Optional routing/context fields carried in the metadata.
        self.tags = self.metadata.get('tags', [])
        self.source = self.metadata.get('source', 'system')
        self.tenant = self.metadata.get('tenant', 'all')

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the alert into a JSON-friendly dictionary."""
        resolved = self.resolved_at
        acked = self.acknowledged_at
        return {
            'id': self.id,
            'title': self.title,
            'description': self.description,
            'severity': self.severity,
            'category': self.category,
            'metadata': self.metadata,
            'created_at': self.created_at.isoformat(),
            'resolved_at': None if resolved is None else resolved.isoformat(),
            'acknowledged_at': None if acked is None else acked.isoformat(),
            'acknowledged_by': self.acknowledged_by,
            'tags': self.tags,
            'source': self.source,
            'tenant': self.tenant,
            'status': self.get_status(),
        }

    def get_status(self) -> str:
        """Return 'resolved', 'acknowledged', or 'active'."""
        if self.resolved_at:
            return 'resolved'
        if self.acknowledged_at:
            return 'acknowledged'
        return 'active'

    def acknowledge(self, user: str):
        """Record that *user* has seen and taken ownership of the alert."""
        self.acknowledged_at = timezone.now()
        self.acknowledged_by = user
        logger.info(f"Alert {self.id} acknowledged by {user}")

    def resolve(self, user: Optional[str] = None):
        """Mark the alert resolved and bump the resolution counter."""
        self.resolved_at = timezone.now()
        if user:
            self.acknowledged_by = user
        logger.info(f"Alert {self.id} resolved by {user or 'system'}")
        # Mirror the resolution into Prometheus.
        ALERTS_RESOLVED.labels(
            severity=self.severity,
            category=self.category
        ).inc()
class AlertRule:
    """Declarative alert rule.

    Couples a zero-argument boolean *condition* with message templates
    and classification. ``should_trigger`` gates on the enabled flag, a
    per-rule cooldown, and the condition; ``trigger`` renders the
    templates into a new Alert.
    """

    def __init__(
        self,
        name: str,
        condition: Callable[[], bool],
        title_template: str,
        description_template: str,
        severity: str,
        category: str,
        cooldown_minutes: int = 15,
        enabled: bool = True
    ):
        self.name = name
        self.condition = condition
        self.title_template = title_template
        self.description_template = description_template
        self.severity = severity
        self.category = category
        self.cooldown_minutes = cooldown_minutes
        self.enabled = enabled
        self.last_triggered = None  # timestamp of the most recent trigger
        self.metadata = {}          # values substituted into the templates

    def should_trigger(self) -> bool:
        """Return True when the rule is enabled, off cooldown, and its condition holds."""
        if not self.enabled:
            return False
        # Respect the cooldown window to avoid alert storms.
        if self.last_triggered:
            cooldown_until = self.last_triggered + timedelta(minutes=self.cooldown_minutes)
            if timezone.now() < cooldown_until:
                return False
        try:
            return self.condition()
        except Exception as e:
            # A broken condition must never break the whole check loop.
            logger.error(f"Error checking alert rule {self.name}: {e}")
            return False

    def _render(self, template: str) -> str:
        """Format *template* from self.metadata, keeping unknown placeholders.

        Plain ``template.format(**metadata)`` raises KeyError whenever the
        template references a field that was not supplied — which is the
        normal case when ``trigger()`` is called with no metadata — and
        that exception silently suppressed the alert upstream. Missing
        keys are now left as literal ``{placeholder}`` text instead.
        """
        class _Defaulting(dict):
            def __missing__(self, key):
                return '{' + key + '}'

        return template.format_map(_Defaulting(self.metadata))

    def trigger(self, metadata: Optional[Dict[str, Any]] = None) -> 'Alert':
        """Record the trigger time and return a freshly rendered Alert."""
        self.last_triggered = timezone.now()
        self.metadata = metadata or {}
        return Alert(
            title=self._render(self.title_template),
            description=self._render(self.description_template),
            severity=self.severity,
            category=self.category,
            metadata={**self.metadata, 'rule_name': self.name}
        )
class AlertManager:
    """Central alert management system.

    Owns the registered rules, the pool of currently active alerts, a
    bounded in-memory history (last 1000), and the notification
    channels. Active alerts are additionally mirrored into Redis with a
    24-hour TTL when a Redis client could be created.
    """

    def __init__(self):
        self.rules: List[AlertRule] = []
        self.active_alerts: Dict[str, Alert] = {}  # alert id -> Alert
        self.alert_history: List[Alert] = []       # bounded to last 1000
        self.notifiers = []
        self.redis_client = None
        self.initialize_redis()
        self.setup_default_rules()
        self.setup_notifiers()

    def initialize_redis(self):
        """Connect to Redis for alert persistence; degrade gracefully on failure."""
        try:
            self.redis_client = redis.from_url(settings.REDIS_URL)
        except Exception as e:
            # Persistence is best-effort: alerts still work in-memory only.
            logger.warning(f"Failed to initialize Redis for alerts: {e}")

    def setup_default_rules(self):
        """Register the built-in alert rules for each category."""
        # System alerts
        self.add_rule(AlertRule(
            name='high_cpu_usage',
            condition=self._check_high_cpu_usage,
            title_template='High CPU Usage Detected',
            description_template='CPU usage is {cpu_usage}% on server {server}',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.SYSTEM,
            cooldown_minutes=10
        ))
        self.add_rule(AlertRule(
            name='critical_cpu_usage',
            condition=self._check_critical_cpu_usage,
            title_template='Critical CPU Usage',
            description_template='CPU usage is {cpu_usage}% on server {server} - immediate attention required',
            severity=AlertSeverity.CRITICAL,
            category=AlertCategory.SYSTEM,
            cooldown_minutes=5
        ))
        # Database alerts
        self.add_rule(AlertRule(
            name='database_connection_errors',
            condition=self._check_database_connection_errors,
            title_template='Database Connection Errors',
            description_template='Database connection errors detected: {error_count} errors in the last 5 minutes',
            severity=AlertSeverity.ERROR,
            category=AlertCategory.DATABASE,
            cooldown_minutes=5
        ))
        # Application alerts
        self.add_rule(AlertRule(
            name='high_error_rate',
            condition=self._check_high_error_rate,
            title_template='High Application Error Rate',
            description_template='Application error rate is {error_rate}% (threshold: 5%)',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.APPLICATION,
            cooldown_minutes=15
        ))
        # Business alerts
        self.add_rule(AlertRule(
            name='low_active_users',
            condition=self._check_low_active_users,
            title_template='Low Active Users',
            description_template='Only {active_users} active users detected (threshold: {threshold})',
            severity=AlertSeverity.INFO,
            category=AlertCategory.BUSINESS,
            cooldown_minutes=60
        ))
        # Malaysian-specific alerts
        self.add_rule(AlertRule(
            name='malaysian_service_degradation',
            condition=self._check_malaysian_service_degradation,
            title_template='Malaysian Service Degradation',
            description_template='Malaysian service availability is {availability}% (threshold: 99%)',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.MALAYSIAN,
            cooldown_minutes=10
        ))
        # Security alerts
        self.add_rule(AlertRule(
            name='suspicious_login_activity',
            condition=self._check_suspicious_login_activity,
            title_template='Suspicious Login Activity',
            description_template='Detected {failed_logins} failed login attempts from IP {ip_address}',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.SECURITY,
            cooldown_minutes=15
        ))

    def setup_notifiers(self):
        """Register notification channels based on the configured settings."""
        # Email notifier. getattr keeps this consistent with the guards
        # below and tolerates settings objects without EMAIL_HOST defined.
        if getattr(settings, 'EMAIL_HOST', None):
            self.add_notifier(EmailNotifier())
        # Slack notifier
        if hasattr(settings, 'SLACK_WEBHOOK_URL'):
            self.add_notifier(SlackNotifier())
        # SMS notifier for critical alerts (Malaysian numbers)
        if hasattr(settings, 'SMS_API_KEY'):
            self.add_notifier(SMSNotifier())

    def add_rule(self, rule: "AlertRule"):
        """Add an alert rule to the check cycle."""
        self.rules.append(rule)
        logger.info(f"Added alert rule: {rule.name}")

    def add_notifier(self, notifier):
        """Add a notification channel (must expose should_notify/send)."""
        self.notifiers.append(notifier)
        logger.info(f"Added notifier: {notifier.__class__.__name__}")

    def check_rules(self):
        """Evaluate every rule and trigger alerts for those that fire."""
        for rule in self.rules:
            try:
                if rule.should_trigger():
                    self.trigger_alert(rule.trigger())
            except Exception as e:
                # One faulty rule must not stop the remaining checks.
                logger.error(f"Error checking rule {rule.name}: {e}")

    def trigger_alert(self, alert: "Alert"):
        """Register a new alert, update metrics/persistence, and notify."""
        # De-duplicate: skip if an equivalent alert is already active.
        for existing_alert in self.active_alerts.values():
            if (existing_alert.title == alert.title and
                    existing_alert.severity == alert.severity and
                    existing_alert.get_status() == 'active'):
                logger.debug(f"Similar alert already active: {existing_alert.id}")
                return
        self.active_alerts[alert.id] = alert
        self.alert_history.append(alert)
        # Update metrics. ALERTS_ACTIVE was previously declared but never
        # updated; it is now incremented here and decremented on resolve.
        ALERTS_TOTAL.labels(
            severity=alert.severity,
            category=alert.category
        ).inc()
        ALERTS_ACTIVE.labels(
            severity=alert.severity,
            category=alert.category
        ).inc()
        # Keep only recent history
        if len(self.alert_history) > 1000:
            self.alert_history = self.alert_history[-1000:]
        # Persist to Redis with a 24-hour TTL (best effort).
        if self.redis_client:
            try:
                self.redis_client.setex(
                    f"alert:{alert.id}",
                    86400,  # 24 hours
                    json.dumps(alert.to_dict())
                )
            except Exception as e:
                logger.error(f"Failed to store alert in Redis: {e}")
        self.send_notifications(alert)
        logger.warning(f"Alert triggered: {alert.title} ({alert.severity})")

    def resolve_alert(self, alert_id: str, user: Optional[str] = None):
        """Resolve an active alert and drop it from the active pool."""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.resolve(user)
            del self.active_alerts[alert_id]
            # Keep the active-alerts gauge in sync with the pool.
            ALERTS_ACTIVE.labels(
                severity=alert.severity,
                category=alert.category
            ).dec()
            if self.redis_client:
                try:
                    self.redis_client.delete(f"alert:{alert_id}")
                except Exception as e:
                    logger.error(f"Failed to delete alert from Redis: {e}")
            logger.info(f"Alert resolved: {alert.title}")

    def acknowledge_alert(self, alert_id: str, user: str):
        """Acknowledge an active alert on behalf of *user*."""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.acknowledge(user)
            logger.info(f"Alert acknowledged: {alert.title} by {user}")

    def get_active_alerts(self, severity: Optional[str] = None, category: Optional[str] = None) -> List["Alert"]:
        """Return active alerts, optionally filtered by severity and/or category."""
        alerts = list(self.active_alerts.values())
        if severity:
            alerts = [a for a in alerts if a.severity == severity]
        if category:
            alerts = [a for a in alerts if a.category == category]
        return alerts

    def get_alert_history(self, hours: int = 24) -> List["Alert"]:
        """Return alerts created within the last *hours* hours."""
        since = timezone.now() - timedelta(hours=hours)
        return [a for a in self.alert_history if a.created_at >= since]

    def send_notifications(self, alert: "Alert"):
        """Fan the alert out to every notifier that accepts it."""
        for notifier in self.notifiers:
            try:
                if notifier.should_notify(alert):
                    notifier.send(alert)
            except Exception as e:
                logger.error(f"Error sending notification via {notifier.__class__.__name__}: {e}")

    # ----- rule condition callbacks -------------------------------------

    def _check_high_cpu_usage(self) -> bool:
        """True when system CPU usage exceeds 80% (False if psutil unavailable)."""
        try:
            import psutil
            return psutil.cpu_percent(interval=1) > 80
        except Exception:
            return False

    def _check_critical_cpu_usage(self) -> bool:
        """True when system CPU usage exceeds 90% (False if psutil unavailable)."""
        try:
            import psutil
            return psutil.cpu_percent(interval=1) > 90
        except Exception:
            return False

    def _check_database_connection_errors(self) -> bool:
        """Placeholder: would query the error-tracking system for DB errors."""
        return False

    def _check_high_error_rate(self) -> bool:
        """Placeholder: would compare application error rate to the 5% threshold."""
        return False

    def _check_low_active_users(self) -> bool:
        """True when fewer than 10 active users logged in during the last 5 minutes."""
        try:
            from django.contrib.auth import get_user_model
            User = get_user_model()
            five_minutes_ago = timezone.now() - timedelta(minutes=5)
            active_count = User.objects.filter(
                last_login__gte=five_minutes_ago,
                is_active=True
            ).count()
            return active_count < 10
        except Exception:
            return False

    def _check_malaysian_service_degradation(self) -> bool:
        """Placeholder: would check Malaysian-specific service availability."""
        return False

    def _check_suspicious_login_activity(self) -> bool:
        """Placeholder: would look for suspicious login patterns."""
        return False
class EmailNotifier:
    """Delivers alert notifications through Django's email backend."""

    def should_notify(self, alert: Alert) -> bool:
        """Email is reserved for warning-severity alerts and above."""
        return alert.severity in (
            AlertSeverity.WARNING,
            AlertSeverity.ERROR,
            AlertSeverity.CRITICAL,
        )

    def send(self, alert: Alert):
        """Compose and send the alert email; failures are logged, never raised."""
        try:
            subject = f"[{alert.severity.upper()}] {alert.title}"
            message = f"""
Alert Details:
- Title: {alert.title}
- Severity: {alert.severity}
- Category: {alert.category}
- Description: {alert.description}
- Time: {alert.created_at}
- Source: {alert.source}
- Tenant: {alert.tenant}
Additional Information:
{json.dumps(alert.metadata, indent=2)}
"""
            send_mail(
                subject,
                message,
                settings.DEFAULT_FROM_EMAIL,
                settings.ALERT_EMAIL_RECIPIENTS,
                fail_silently=False
            )
            logger.info(f"Email notification sent for alert: {alert.id}")
        except Exception as e:
            logger.error(f"Failed to send email notification: {e}")
class SlackNotifier:
    """Posts alert notifications to a Slack incoming webhook."""

    def should_notify(self, alert: Alert) -> bool:
        """Slack receives every alert regardless of severity."""
        return True

    def send(self, alert: Alert):
        """Build the Slack attachment payload and POST it to the webhook."""
        try:
            webhook_url = settings.SLACK_WEBHOOK_URL
            # Attachment sidebar colour keyed by severity.
            severity_colors = {
                AlertSeverity.INFO: '#36a64f',
                AlertSeverity.WARNING: '#ff9500',
                AlertSeverity.ERROR: '#ff0000',
                AlertSeverity.CRITICAL: '#990000',
            }
            attachment = {
                'color': severity_colors.get(alert.severity, '#36a64f'),
                'title': alert.title,
                'text': alert.description,
                'fields': [
                    {'title': 'Severity', 'value': alert.severity, 'short': True},
                    {'title': 'Category', 'value': alert.category, 'short': True},
                    {'title': 'Time', 'value': alert.created_at.strftime('%Y-%m-%d %H:%M:%S'), 'short': True},
                    {'title': 'Tenant', 'value': alert.tenant, 'short': True},
                ],
                'footer': 'Malaysian SME Platform Alert System',
                'ts': int(alert.created_at.timestamp()),
            }
            payload = {
                'text': f'{alert.severity.upper()}: {alert.title}',
                'attachments': [attachment],
            }
            response = requests.post(webhook_url, json=payload, timeout=10)
            response.raise_for_status()
            logger.info(f"Slack notification sent for alert: {alert.id}")
        except Exception as e:
            logger.error(f"Failed to send Slack notification: {e}")
class SMSNotifier:
    """Sends SMS notifications (Malaysian numbers) for critical alerts only."""

    def should_notify(self, alert: Alert) -> bool:
        """SMS is the most intrusive channel; restrict it to critical alerts."""
        return alert.severity == AlertSeverity.CRITICAL

    def send(self, alert: Alert):
        """Dispatch an SMS for a critical alert.

        The Malaysian SMS gateway integration is not wired up yet; the
        intended flow is to POST to settings.SMS_API_URL with
        settings.SMS_API_KEY and settings.CRITICAL_ALERT_SMS_RECIPIENTS,
        sending "CRITICAL: <title>. <first 100 chars of description>".
        Until then the attempt is only logged.
        """
        try:
            # Placeholder: record the attempt until the gateway exists.
            logger.info(f"SMS notification would be sent for critical alert: {alert.id}")
        except Exception as e:
            logger.error(f"Failed to send SMS notification: {e}")
# Module-level singleton alert manager. NOTE(review): constructing it at
# import time connects to Redis and registers rules/notifiers as a side
# effect of importing this module.
alert_manager = AlertManager()