"""
Alert management system for the Malaysian SME SaaS platform.
Provides comprehensive alerting with Malaysian context.
"""

import json
import logging
import smtplib
import socket
import uuid
from datetime import datetime, timedelta
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import Any, Callable, Dict, List, Optional

import redis
import requests
from django.conf import settings
from django.core.mail import send_mail
from django.db import connection
from django.utils import timezone
from prometheus_client import Counter, Gauge

logger = logging.getLogger(__name__)

# Alert metrics
ALERTS_TOTAL = Counter('alerts_total', 'Total alerts generated', ['severity', 'category'])
ALERTS_RESOLVED = Counter('alerts_resolved_total', 'Total alerts resolved', ['severity', 'category'])
ALERTS_ACTIVE = Gauge('alerts_active', 'Currently active alerts', ['severity', 'category'])


class AlertSeverity:
    """Alert severity levels, ordered from least to most severe."""

    INFO = 'info'
    WARNING = 'warning'
    ERROR = 'error'
    CRITICAL = 'critical'


class AlertCategory:
    """Alert categories used to group alerts and label metrics."""

    SYSTEM = 'system'
    APPLICATION = 'application'
    DATABASE = 'database'
    CACHE = 'cache'
    SECURITY = 'security'
    BUSINESS = 'business'
    MALAYSIAN = 'malaysian'


class _SafeFormatDict(dict):
    """Mapping for ``str.format_map`` that leaves unknown placeholders intact."""

    def __missing__(self, key):
        # Re-emit the placeholder so a missing value degrades the text
        # instead of raising KeyError and losing the alert altogether.
        return '{' + str(key) + '}'


class Alert:
    """Single alert instance.

    Args:
        title: Short human-readable summary.
        description: Longer explanation of what was detected.
        severity: One of the ``AlertSeverity`` values.
        category: One of the ``AlertCategory`` values.
        metadata: Optional extra context. The keys ``tags``, ``source`` and
            ``tenant`` are also exposed as attributes (defaults: ``[]``,
            ``'system'`` and ``'all'``).
    """

    def __init__(
        self,
        title: str,
        description: str,
        severity: str,
        category: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        now = timezone.now()
        # A random suffix keeps ids unique even when two alerts with the same
        # title are created within the same second; the previous hash(title)
        # suffix collided in that case and varied between processes.
        self.id = f"{int(now.timestamp())}-{uuid.uuid4().hex[:12]}"
        self.title = title
        self.description = description
        self.severity = severity
        self.category = category
        # Copy so later changes to the caller's dict do not leak into the alert.
        self.metadata = dict(metadata) if metadata else {}
        self.created_at = now
        self.resolved_at = None
        self.resolved_by = None
        self.acknowledged_at = None
        self.acknowledged_by = None
        self.tags = self.metadata.get('tags', [])
        self.source = self.metadata.get('source', 'system')
        self.tenant = self.metadata.get('tenant', 'all')

    def to_dict(self) -> Dict[str, Any]:
        """Convert alert to a dictionary with ISO-formatted timestamps."""
        return {
            'id': self.id,
            'title': self.title,
            'description': self.description,
            'severity': self.severity,
            'category': self.category,
            'metadata': self.metadata,
            'created_at': self.created_at.isoformat(),
            'resolved_at': self.resolved_at.isoformat() if self.resolved_at else None,
            'resolved_by': self.resolved_by,
            'acknowledged_at': self.acknowledged_at.isoformat() if self.acknowledged_at else None,
            'acknowledged_by': self.acknowledged_by,
            'tags': self.tags,
            'source': self.source,
            'tenant': self.tenant,
            'status': self.get_status(),
        }

    def get_status(self) -> str:
        """Return ``'resolved'``, ``'acknowledged'`` or ``'active'``."""
        if self.resolved_at:
            return 'resolved'
        if self.acknowledged_at:
            return 'acknowledged'
        return 'active'

    def acknowledge(self, user: str):
        """Mark the alert as acknowledged by ``user``."""
        self.acknowledged_at = timezone.now()
        self.acknowledged_by = user
        logger.info("Alert %s acknowledged by %s", self.id, user)

    def resolve(self, user: Optional[str] = None):
        """Mark the alert as resolved and bump the resolved-alerts counter.

        Args:
            user: Who resolved the alert; ``None`` means resolved by the system.
        """
        self.resolved_at = timezone.now()
        if user:
            # Stored separately so the record of who acknowledged the alert
            # is not overwritten by whoever resolves it.
            self.resolved_by = user
        logger.info("Alert %s resolved by %s", self.id, user or 'system')

        # Update metrics
        ALERTS_RESOLVED.labels(
            severity=self.severity,
            category=self.category
        ).inc()


class AlertRule:
    """Alert rule definition.

    Args:
        name: Unique rule name.
        condition: Zero-argument callable returning ``True`` when the rule fires.
        title_template: ``str.format``-style template for the alert title.
        description_template: ``str.format``-style template for the description.
        severity: One of the ``AlertSeverity`` values.
        category: One of the ``AlertCategory`` values.
        cooldown_minutes: Minimum minutes between two triggers of this rule.
        enabled: Disabled rules never trigger.
    """

    def __init__(
        self,
        name: str,
        condition: Callable[[], bool],
        title_template: str,
        description_template: str,
        severity: str,
        category: str,
        cooldown_minutes: int = 15,
        enabled: bool = True
    ):
        self.name = name
        self.condition = condition
        self.title_template = title_template
        self.description_template = description_template
        self.severity = severity
        self.category = category
        self.cooldown_minutes = cooldown_minutes
        self.enabled = enabled
        self.last_triggered = None
        self.metadata = {}

    def should_trigger(self) -> bool:
        """Check if rule should trigger alert.

        Returns ``False`` when the rule is disabled, still in cooldown, or
        when evaluating the condition raises (the error is logged).
        """
        if not self.enabled:
            return False

        # Check cooldown
        if self.last_triggered:
            cooldown_until = self.last_triggered + timedelta(minutes=self.cooldown_minutes)
            if timezone.now() < cooldown_until:
                return False

        # Check condition
        try:
            return self.condition()
        except Exception as e:
            logger.error("Error checking alert rule %s: %s", self.name, e)
            return False

    def _render(self, template: str) -> str:
        """Fill ``template`` from the rule metadata, tolerating missing keys."""
        try:
            return template.format_map(_SafeFormatDict(self.metadata))
        except (AttributeError, IndexError, KeyError, ValueError) as e:
            # Malformed template or a format spec applied to a missing value:
            # fall back to the raw template rather than dropping the alert.
            logger.warning("Could not render template for rule %s: %s", self.name, e)
            return template

    def trigger(self, metadata: Optional[Dict[str, Any]] = None) -> Alert:
        """Trigger alert from rule.

        Starts the cooldown, renders the title/description templates with
        ``metadata`` and returns the resulting ``Alert``. Placeholders with no
        matching metadata key are left verbatim in the rendered text.
        """
        self.last_triggered = timezone.now()
        self.metadata = dict(metadata) if metadata else {}

        # Format title and description
        title = self._render(self.title_template)
        description = self._render(self.description_template)

        return Alert(
            title=title,
            description=description,
            severity=self.severity,
            category=self.category,
            metadata={**self.metadata, 'rule_name': self.name}
        )


class AlertManager:
    """Main alert management system.

    Holds the rule set, the in-memory active alerts and history, and the
    notification channels. Alerts are also mirrored to Redis when available.
    """

    HIGH_CPU_THRESHOLD = 80
    CRITICAL_CPU_THRESHOLD = 90
    LOW_ACTIVE_USERS_THRESHOLD = 10
    MAX_HISTORY = 1000
    REDIS_TTL_SECONDS = 86400  # 24 hours

    def __init__(self):
        self.rules: List[AlertRule] = []
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: List[Alert] = []
        self.notifiers: List[Any] = []
        self.redis_client = None
        # Latest values measured by each built-in condition check, keyed by
        # rule name; used to fill the rule templates when an alert fires.
        self._rule_context: Dict[str, Dict[str, Any]] = {}
        self.initialize_redis()
        self.setup_default_rules()
        self.setup_notifiers()

    def initialize_redis(self):
        """Initialize Redis connection for alert persistence (best effort)."""
        try:
            self.redis_client = redis.from_url(settings.REDIS_URL)
        except Exception as e:
            logger.warning("Failed to initialize Redis for alerts: %s", e)

    def setup_default_rules(self):
        """Setup default alert rules."""
        # System alerts
        self.add_rule(AlertRule(
            name='high_cpu_usage',
            condition=self._check_high_cpu_usage,
            title_template='High CPU Usage Detected',
            description_template='CPU usage is {cpu_usage}% on server {server}',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.SYSTEM,
            cooldown_minutes=10
        ))

        self.add_rule(AlertRule(
            name='critical_cpu_usage',
            condition=self._check_critical_cpu_usage,
            title_template='Critical CPU Usage',
            description_template='CPU usage is {cpu_usage}% on server {server} - immediate attention required',
            severity=AlertSeverity.CRITICAL,
            category=AlertCategory.SYSTEM,
            cooldown_minutes=5
        ))

        # Database alerts
        self.add_rule(AlertRule(
            name='database_connection_errors',
            condition=self._check_database_connection_errors,
            title_template='Database Connection Errors',
            description_template='Database connection errors detected: {error_count} errors in the last 5 minutes',
            severity=AlertSeverity.ERROR,
            category=AlertCategory.DATABASE,
            cooldown_minutes=5
        ))

        # Application alerts
        self.add_rule(AlertRule(
            name='high_error_rate',
            condition=self._check_high_error_rate,
            title_template='High Application Error Rate',
            description_template='Application error rate is {error_rate}% (threshold: 5%)',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.APPLICATION,
            cooldown_minutes=15
        ))

        # Business alerts
        self.add_rule(AlertRule(
            name='low_active_users',
            condition=self._check_low_active_users,
            title_template='Low Active Users',
            description_template='Only {active_users} active users detected (threshold: {threshold})',
            severity=AlertSeverity.INFO,
            category=AlertCategory.BUSINESS,
            cooldown_minutes=60
        ))

        # Malaysian-specific alerts
        self.add_rule(AlertRule(
            name='malaysian_service_degradation',
            condition=self._check_malaysian_service_degradation,
            title_template='Malaysian Service Degradation',
            description_template='Malaysian service availability is {availability}% (threshold: 99%)',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.MALAYSIAN,
            cooldown_minutes=10
        ))

        # Security alerts
        self.add_rule(AlertRule(
            name='suspicious_login_activity',
            condition=self._check_suspicious_login_activity,
            title_template='Suspicious Login Activity',
            description_template='Detected {failed_logins} failed login attempts from IP {ip_address}',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.SECURITY,
            cooldown_minutes=15
        ))

    def setup_notifiers(self):
        """Setup notification channels whose settings are present and non-empty."""
        # Email notifier
        if getattr(settings, 'EMAIL_HOST', None):
            self.add_notifier(EmailNotifier())

        # Slack notifier
        if getattr(settings, 'SLACK_WEBHOOK_URL', None):
            self.add_notifier(SlackNotifier())

        # SMS notifier for critical alerts (Malaysian numbers)
        if getattr(settings, 'SMS_API_KEY', None):
            self.add_notifier(SMSNotifier())

    def add_rule(self, rule: AlertRule):
        """Add alert rule."""
        self.rules.append(rule)
        logger.info("Added alert rule: %s", rule.name)

    def add_notifier(self, notifier):
        """Add notification channel.

        ``notifier`` must provide ``should_notify(alert)`` and ``send(alert)``.
        """
        self.notifiers.append(notifier)
        logger.info("Added notifier: %s", notifier.__class__.__name__)

    def check_rules(self):
        """Check all alert rules and trigger if needed.

        A failure in one rule is logged and does not stop the remaining rules.
        """
        for rule in self.rules:
            try:
                if not rule.should_trigger():
                    continue
                # Hand over whatever the condition check just measured so the
                # rule's templates can be filled in.
                alert = rule.trigger(self._rule_context.get(rule.name))
                self.trigger_alert(alert)
            except Exception as e:
                logger.error("Error checking rule %s: %s", rule.name, e)

    def trigger_alert(self, alert: Alert):
        """Trigger new alert.

        The alert is dropped when an active alert with the same title and
        severity already exists. Otherwise it is recorded, counted in the
        metrics, mirrored to Redis (best effort) and sent to the notifiers.
        """
        # Check if similar active alert exists
        duplicate = next(
            (
                existing for existing in self.active_alerts.values()
                if existing.title == alert.title
                and existing.severity == alert.severity
                and existing.get_status() == 'active'
            ),
            None,
        )
        if duplicate is not None:
            logger.debug("Similar alert already active: %s", duplicate.id)
            return

        # Add alert
        self.active_alerts[alert.id] = alert
        self.alert_history.append(alert)

        # Update metrics
        ALERTS_TOTAL.labels(
            severity=alert.severity,
            category=alert.category
        ).inc()
        ALERTS_ACTIVE.labels(
            severity=alert.severity,
            category=alert.category
        ).inc()

        # Keep only recent history
        if len(self.alert_history) > self.MAX_HISTORY:
            self.alert_history = self.alert_history[-self.MAX_HISTORY:]

        # Store in Redis
        if self.redis_client:
            try:
                self.redis_client.setex(
                    f"alert:{alert.id}",
                    self.REDIS_TTL_SECONDS,
                    json.dumps(alert.to_dict())
                )
            except Exception as e:
                logger.error("Failed to store alert in Redis: %s", e)

        # Send notifications
        self.send_notifications(alert)

        logger.warning("Alert triggered: %s (%s)", alert.title, alert.severity)

    def resolve_alert(self, alert_id: str, user: Optional[str] = None):
        """Resolve an active alert; unknown ids are ignored."""
        alert = self.active_alerts.pop(alert_id, None)
        if alert is None:
            return

        alert.resolve(user)
        ALERTS_ACTIVE.labels(
            severity=alert.severity,
            category=alert.category
        ).dec()

        # Update Redis
        if self.redis_client:
            try:
                self.redis_client.delete(f"alert:{alert_id}")
            except Exception as e:
                logger.error("Failed to delete alert from Redis: %s", e)

        logger.info("Alert resolved: %s", alert.title)

    def acknowledge_alert(self, alert_id: str, user: str):
        """Acknowledge an active alert; unknown ids are ignored."""
        alert = self.active_alerts.get(alert_id)
        if alert is None:
            return
        alert.acknowledge(user)
        logger.info("Alert acknowledged: %s by %s", alert.title, user)

    def get_active_alerts(self, severity: Optional[str] = None, category: Optional[str] = None) -> List[Alert]:
        """Get active alerts with optional filtering by severity and/or category."""
        return [
            alert for alert in self.active_alerts.values()
            if (not severity or alert.severity == severity)
            and (not category or alert.category == category)
        ]

    def get_alert_history(self, hours: int = 24) -> List[Alert]:
        """Get alerts created within the last ``hours`` hours."""
        since = timezone.now() - timedelta(hours=hours)
        return [a for a in self.alert_history if a.created_at >= since]

    def send_notifications(self, alert: Alert):
        """Send alert notifications through every notifier that opts in."""
        for notifier in self.notifiers:
            try:
                if notifier.should_notify(alert):
                    notifier.send(alert)
            except Exception as e:
                logger.error(
                    "Error sending notification via %s: %s",
                    notifier.__class__.__name__, e
                )

    # Alert condition methods
    def _cpu_usage_exceeds(self, rule_name: str, threshold: float) -> bool:
        """Sample CPU usage, record it for ``rule_name`` and compare to ``threshold``.

        Note that sampling blocks for one second. Returns ``False`` when the
        measurement is unavailable (for example psutil is not installed).
        """
        try:
            import psutil
            cpu_usage = psutil.cpu_percent(interval=1)
            server = socket.gethostname()
        except Exception as e:
            logger.debug("CPU usage check unavailable: %s", e)
            return False

        self._rule_context[rule_name] = {'cpu_usage': cpu_usage, 'server': server}
        return cpu_usage > threshold

    def _check_high_cpu_usage(self) -> bool:
        """Check for high CPU usage."""
        return self._cpu_usage_exceeds('high_cpu_usage', self.HIGH_CPU_THRESHOLD)

    def _check_critical_cpu_usage(self) -> bool:
        """Check for critical CPU usage."""
        return self._cpu_usage_exceeds('critical_cpu_usage', self.CRITICAL_CPU_THRESHOLD)

    def _check_database_connection_errors(self) -> bool:
        """Check for database connection errors."""
        # This would integrate with your error tracking system
        # For now, return False as placeholder
        return False

    def _check_high_error_rate(self) -> bool:
        """Check for high application error rate."""
        # This would check application error rates
        # For now, return False as placeholder
        return False

    def _check_low_active_users(self) -> bool:
        """Check for low active users.

        Counts active accounts whose ``last_login`` falls within the last five
        minutes. Returns ``False`` when the query fails (the error is logged).
        """
        try:
            from django.contrib.auth import get_user_model
            User = get_user_model()

            five_minutes_ago = timezone.now() - timedelta(minutes=5)
            active_count = User.objects.filter(
                last_login__gte=five_minutes_ago,
                is_active=True
            ).count()
        except Exception as e:
            logger.warning("Active user check failed: %s", e)
            return False

        self._rule_context['low_active_users'] = {
            'active_users': active_count,
            'threshold': self.LOW_ACTIVE_USERS_THRESHOLD,
        }
        return active_count < self.LOW_ACTIVE_USERS_THRESHOLD

    def _check_malaysian_service_degradation(self) -> bool:
        """Check for Malaysian service degradation."""
        # This would check Malaysian-specific service health
        # For now, return False as placeholder
        return False

    def _check_suspicious_login_activity(self) -> bool:
        """Check for suspicious login activity."""
        # This would check for suspicious login patterns
        # For now, return False as placeholder
        return False


class EmailNotifier:
    """Email notification system."""

    NOTIFY_SEVERITIES = (AlertSeverity.WARNING, AlertSeverity.ERROR, AlertSeverity.CRITICAL)

    def should_notify(self, alert: Alert) -> bool:
        """Check if should send email notification."""
        # Send emails for warnings and above
        return alert.severity in self.NOTIFY_SEVERITIES

    def send(self, alert: Alert):
        """Send email notification; failures are logged, never raised."""
        try:
            subject = f"[{alert.severity.upper()}] {alert.title}"
            message = "\n".join([
                "",
                "Alert Details:",
                f"- Title: {alert.title}",
                f"- Severity: {alert.severity}",
                f"- Category: {alert.category}",
                f"- Description: {alert.description}",
                f"- Time: {alert.created_at}",
                f"- Source: {alert.source}",
                f"- Tenant: {alert.tenant}",
                "",
                "Additional Information:",
                json.dumps(alert.metadata, indent=2),
                "",
            ])

            send_mail(
                subject,
                message,
                settings.DEFAULT_FROM_EMAIL,
                settings.ALERT_EMAIL_RECIPIENTS,
                fail_silently=False
            )

            logger.info("Email notification sent for alert: %s", alert.id)
        except Exception as e:
            logger.error("Failed to send email notification: %s", e)


class SlackNotifier:
    """Slack notification system."""

    # Attachment color based on severity
    SEVERITY_COLORS = {
        AlertSeverity.INFO: '#36a64f',
        AlertSeverity.WARNING: '#ff9500',
        AlertSeverity.ERROR: '#ff0000',
        AlertSeverity.CRITICAL: '#990000',
    }
    DEFAULT_COLOR = '#36a64f'

    def should_notify(self, alert: Alert) -> bool:
        """Check if should send Slack notification."""
        # Send Slack for all alerts
        return True

    def send(self, alert: Alert):
        """Post the alert to the Slack webhook; failures are logged, never raised."""
        try:
            webhook_url = settings.SLACK_WEBHOOK_URL

            payload = {
                'text': f'{alert.severity.upper()}: {alert.title}',
                'attachments': [{
                    'color': self.SEVERITY_COLORS.get(alert.severity, self.DEFAULT_COLOR),
                    'title': alert.title,
                    'text': alert.description,
                    'fields': [
                        {'title': 'Severity', 'value': alert.severity, 'short': True},
                        {'title': 'Category', 'value': alert.category, 'short': True},
                        {'title': 'Time', 'value': alert.created_at.strftime('%Y-%m-%d %H:%M:%S'), 'short': True},
                        {'title': 'Tenant', 'value': alert.tenant, 'short': True},
                    ],
                    'footer': 'Malaysian SME Platform Alert System',
                    'ts': int(alert.created_at.timestamp())
                }]
            }

            response = requests.post(webhook_url, json=payload, timeout=10)
            response.raise_for_status()

            logger.info("Slack notification sent for alert: %s", alert.id)
        except Exception as e:
            logger.error("Failed to send Slack notification: %s", e)


class SMSNotifier:
    """SMS notification system for critical alerts."""

    def should_notify(self, alert: Alert) -> bool:
        """Check if should send SMS notification."""
        # Only send SMS for critical alerts
        return alert.severity == AlertSeverity.CRITICAL

    def send(self, alert: Alert):
        """Send SMS notification (currently only logs the attempt)."""
        try:
            # This would integrate with Malaysian SMS service
            # For now, just log the attempt
            logger.info("SMS notification would be sent for critical alert: %s", alert.id)

            # Example integration with Malaysian SMS service
            # sms_api_url = settings.SMS_API_URL
            # api_key = settings.SMS_API_KEY
            # recipients = settings.CRITICAL_ALERT_SMS_RECIPIENTS

            # message = f"CRITICAL: {alert.title}. {alert.description[:100]}"
            # payload = {
            #     'api_key': api_key,
            #     'recipients': recipients,
            #     'message': message
            # }
            # response = requests.post(sms_api_url, json=payload, timeout=10)
            # response.raise_for_status()

        except Exception as e:
            logger.error("Failed to send SMS notification: %s", e)


# Global alert manager instance
alert_manager = AlertManager()