"""
|
|
Alert management system for the Malaysian SME SaaS platform.
|
|
Provides comprehensive alerting with Malaysian context.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import smtplib
|
|
import requests
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, List, Any, Optional, Callable
|
|
from email.mime.text import MimeText
|
|
from email.mime.multipart import MimeMultipart
|
|
from django.conf import settings
|
|
from django.core.mail import send_mail
|
|
from django.utils import timezone
|
|
from django.db import connection
|
|
from prometheus_client import Counter, Gauge
|
|
import redis
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|

# Alert metrics
ALERTS_TOTAL = Counter('alerts_total', 'Total alerts generated', ['severity', 'category'])
ALERTS_RESOLVED = Counter('alerts_resolved_total', 'Total alerts resolved', ['severity', 'category'])
ALERTS_ACTIVE = Gauge('alerts_active', 'Currently active alerts', ['severity', 'category'])


class AlertSeverity:
    """Alert severity levels."""

    INFO = 'info'
    WARNING = 'warning'
    ERROR = 'error'
    CRITICAL = 'critical'


class AlertCategory:
    """Alert categories."""

    SYSTEM = 'system'
    APPLICATION = 'application'
    DATABASE = 'database'
    CACHE = 'cache'
    SECURITY = 'security'
    BUSINESS = 'business'
    MALAYSIAN = 'malaysian'

class Alert:
    """Single alert instance."""

    def __init__(
        self,
        title: str,
        description: str,
        severity: str,
        category: str,
        metadata: Optional[Dict[str, Any]] = None
    ):
        self.id = f"{int(timezone.now().timestamp())}-{hash(title)}"
        self.title = title
        self.description = description
        self.severity = severity
        self.category = category
        self.metadata = metadata or {}
        self.created_at = timezone.now()
        self.resolved_at = None
        self.acknowledged_at = None
        self.acknowledged_by = None
        self.tags = self.metadata.get('tags', [])
        self.source = self.metadata.get('source', 'system')
        self.tenant = self.metadata.get('tenant', 'all')

    def to_dict(self) -> Dict[str, Any]:
        """Convert alert to dictionary."""
        return {
            'id': self.id,
            'title': self.title,
            'description': self.description,
            'severity': self.severity,
            'category': self.category,
            'metadata': self.metadata,
            'created_at': self.created_at.isoformat(),
            'resolved_at': self.resolved_at.isoformat() if self.resolved_at else None,
            'acknowledged_at': self.acknowledged_at.isoformat() if self.acknowledged_at else None,
            'acknowledged_by': self.acknowledged_by,
            'tags': self.tags,
            'source': self.source,
            'tenant': self.tenant,
            'status': self.get_status(),
        }

    def get_status(self) -> str:
        """Get alert status."""
        if self.resolved_at:
            return 'resolved'
        elif self.acknowledged_at:
            return 'acknowledged'
        else:
            return 'active'

    def acknowledge(self, user: str):
        """Acknowledge alert."""
        self.acknowledged_at = timezone.now()
        self.acknowledged_by = user
        logger.info(f"Alert {self.id} acknowledged by {user}")

    def resolve(self, user: Optional[str] = None):
        """Resolve alert."""
        self.resolved_at = timezone.now()
        if user:
            self.acknowledged_by = user
        logger.info(f"Alert {self.id} resolved by {user or 'system'}")

        # Update metrics
        ALERTS_RESOLVED.labels(
            severity=self.severity,
            category=self.category
        ).inc()

class AlertRule:
    """Alert rule definition."""

    def __init__(
        self,
        name: str,
        condition: Callable[[], bool],
        title_template: str,
        description_template: str,
        severity: str,
        category: str,
        cooldown_minutes: int = 15,
        enabled: bool = True
    ):
        self.name = name
        self.condition = condition
        self.title_template = title_template
        self.description_template = description_template
        self.severity = severity
        self.category = category
        self.cooldown_minutes = cooldown_minutes
        self.enabled = enabled
        self.last_triggered = None
        self.metadata = {}

    def should_trigger(self) -> bool:
        """Check if rule should trigger alert."""
        if not self.enabled:
            return False

        # Check cooldown
        if self.last_triggered:
            cooldown_until = self.last_triggered + timedelta(minutes=self.cooldown_minutes)
            if timezone.now() < cooldown_until:
                return False

        # Check condition
        try:
            return self.condition()
        except Exception as e:
            logger.error(f"Error checking alert rule {self.name}: {e}")
            return False

    def trigger(self, metadata: Optional[Dict[str, Any]] = None) -> Alert:
        """Trigger alert from rule."""
        self.last_triggered = timezone.now()
        self.metadata = metadata or {}

        # Format title and description; fall back to the raw templates when the
        # supplied metadata is missing placeholder keys (e.g. a rule triggered
        # without metadata), instead of raising KeyError.
        try:
            title = self.title_template.format(**self.metadata)
            description = self.description_template.format(**self.metadata)
        except (KeyError, IndexError):
            title = self.title_template
            description = self.description_template

        return Alert(
            title=title,
            description=description,
            severity=self.severity,
            category=self.category,
            metadata={**self.metadata, 'rule_name': self.name}
        )
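
# Illustrative only: a custom rule can be registered on the global
# ``alert_manager`` created at the bottom of this module. The disk-usage check,
# its 85% threshold, and the reliance on ``psutil`` are assumptions for this
# example, not part of the default rule set.
#
# import psutil
#
# def _check_high_disk_usage() -> bool:
#     return psutil.disk_usage('/').percent > 85
#
# alert_manager.add_rule(AlertRule(
#     name='high_disk_usage',
#     condition=_check_high_disk_usage,
#     title_template='High Disk Usage Detected',
#     description_template='Disk usage on the application server exceeded 85%',
#     severity=AlertSeverity.WARNING,
#     category=AlertCategory.SYSTEM,
#     cooldown_minutes=30,
# ))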

class AlertManager:
    """Main alert management system."""

    def __init__(self):
        self.rules: List[AlertRule] = []
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: List[Alert] = []
        self.notifiers = []
        self.redis_client = None
        self.initialize_redis()
        self.setup_default_rules()
        self.setup_notifiers()

    def initialize_redis(self):
        """Initialize Redis connection for alert persistence."""
        try:
            self.redis_client = redis.from_url(settings.REDIS_URL)
        except Exception as e:
            logger.warning(f"Failed to initialize Redis for alerts: {e}")
    def setup_default_rules(self):
        """Setup default alert rules."""
        # System alerts
        self.add_rule(AlertRule(
            name='high_cpu_usage',
            condition=self._check_high_cpu_usage,
            title_template='High CPU Usage Detected',
            description_template='CPU usage is {cpu_usage}% on server {server}',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.SYSTEM,
            cooldown_minutes=10
        ))

        self.add_rule(AlertRule(
            name='critical_cpu_usage',
            condition=self._check_critical_cpu_usage,
            title_template='Critical CPU Usage',
            description_template='CPU usage is {cpu_usage}% on server {server} - immediate attention required',
            severity=AlertSeverity.CRITICAL,
            category=AlertCategory.SYSTEM,
            cooldown_minutes=5
        ))

        # Database alerts
        self.add_rule(AlertRule(
            name='database_connection_errors',
            condition=self._check_database_connection_errors,
            title_template='Database Connection Errors',
            description_template='Database connection errors detected: {error_count} errors in the last 5 minutes',
            severity=AlertSeverity.ERROR,
            category=AlertCategory.DATABASE,
            cooldown_minutes=5
        ))

        # Application alerts
        self.add_rule(AlertRule(
            name='high_error_rate',
            condition=self._check_high_error_rate,
            title_template='High Application Error Rate',
            description_template='Application error rate is {error_rate}% (threshold: 5%)',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.APPLICATION,
            cooldown_minutes=15
        ))

        # Business alerts
        self.add_rule(AlertRule(
            name='low_active_users',
            condition=self._check_low_active_users,
            title_template='Low Active Users',
            description_template='Only {active_users} active users detected (threshold: {threshold})',
            severity=AlertSeverity.INFO,
            category=AlertCategory.BUSINESS,
            cooldown_minutes=60
        ))

        # Malaysian-specific alerts
        self.add_rule(AlertRule(
            name='malaysian_service_degradation',
            condition=self._check_malaysian_service_degradation,
            title_template='Malaysian Service Degradation',
            description_template='Malaysian service availability is {availability}% (threshold: 99%)',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.MALAYSIAN,
            cooldown_minutes=10
        ))

        # Security alerts
        self.add_rule(AlertRule(
            name='suspicious_login_activity',
            condition=self._check_suspicious_login_activity,
            title_template='Suspicious Login Activity',
            description_template='Detected {failed_logins} failed login attempts from IP {ip_address}',
            severity=AlertSeverity.WARNING,
            category=AlertCategory.SECURITY,
            cooldown_minutes=15
        ))
    def setup_notifiers(self):
        """Setup notification channels."""
        # Email notifier
        if settings.EMAIL_HOST:
            self.add_notifier(EmailNotifier())

        # Slack notifier
        if hasattr(settings, 'SLACK_WEBHOOK_URL'):
            self.add_notifier(SlackNotifier())

        # SMS notifier for critical alerts (Malaysian numbers)
        if hasattr(settings, 'SMS_API_KEY'):
            self.add_notifier(SMSNotifier())

    def add_rule(self, rule: AlertRule):
        """Add alert rule."""
        self.rules.append(rule)
        logger.info(f"Added alert rule: {rule.name}")

    def add_notifier(self, notifier):
        """Add notification channel."""
        self.notifiers.append(notifier)
        logger.info(f"Added notifier: {notifier.__class__.__name__}")
    def check_rules(self):
        """Check all alert rules and trigger if needed."""
        for rule in self.rules:
            try:
                if rule.should_trigger():
                    alert = rule.trigger()
                    self.trigger_alert(alert)
            except Exception as e:
                logger.error(f"Error checking rule {rule.name}: {e}")
    def trigger_alert(self, alert: Alert):
        """Trigger new alert."""
        # Check if similar active alert exists
        for existing_alert in self.active_alerts.values():
            if (existing_alert.title == alert.title and
                    existing_alert.severity == alert.severity and
                    existing_alert.get_status() == 'active'):
                logger.debug(f"Similar alert already active: {existing_alert.id}")
                return

        # Add alert
        self.active_alerts[alert.id] = alert
        self.alert_history.append(alert)

        # Update metrics
        ALERTS_TOTAL.labels(
            severity=alert.severity,
            category=alert.category
        ).inc()

        # Keep only recent history
        if len(self.alert_history) > 1000:
            self.alert_history = self.alert_history[-1000:]

        # Store in Redis
        if self.redis_client:
            try:
                self.redis_client.setex(
                    f"alert:{alert.id}",
                    86400,  # 24 hours
                    json.dumps(alert.to_dict())
                )
            except Exception as e:
                logger.error(f"Failed to store alert in Redis: {e}")

        # Send notifications
        self.send_notifications(alert)

        logger.warning(f"Alert triggered: {alert.title} ({alert.severity})")
    def resolve_alert(self, alert_id: str, user: Optional[str] = None):
        """Resolve alert."""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.resolve(user)
            del self.active_alerts[alert_id]

            # Update Redis
            if self.redis_client:
                try:
                    self.redis_client.delete(f"alert:{alert_id}")
                except Exception as e:
                    logger.error(f"Failed to delete alert from Redis: {e}")

            logger.info(f"Alert resolved: {alert.title}")

    def acknowledge_alert(self, alert_id: str, user: str):
        """Acknowledge alert."""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.acknowledge(user)
            logger.info(f"Alert acknowledged: {alert.title} by {user}")
    def get_active_alerts(self, severity: Optional[str] = None, category: Optional[str] = None) -> List[Alert]:
        """Get active alerts with optional filtering."""
        alerts = list(self.active_alerts.values())

        if severity:
            alerts = [a for a in alerts if a.severity == severity]

        if category:
            alerts = [a for a in alerts if a.category == category]

        return alerts

    def get_alert_history(self, hours: int = 24) -> List[Alert]:
        """Get alert history for specified hours."""
        since = timezone.now() - timedelta(hours=hours)
        return [a for a in self.alert_history if a.created_at >= since]
    def send_notifications(self, alert: Alert):
        """Send alert notifications."""
        for notifier in self.notifiers:
            try:
                if notifier.should_notify(alert):
                    notifier.send(alert)
            except Exception as e:
                logger.error(f"Error sending notification via {notifier.__class__.__name__}: {e}")
    # Alert condition methods
    def _check_high_cpu_usage(self) -> bool:
        """Check for high CPU usage."""
        try:
            import psutil
            cpu_usage = psutil.cpu_percent(interval=1)
            return cpu_usage > 80
        except Exception:
            return False

    def _check_critical_cpu_usage(self) -> bool:
        """Check for critical CPU usage."""
        try:
            import psutil
            cpu_usage = psutil.cpu_percent(interval=1)
            return cpu_usage > 90
        except Exception:
            return False
    def _check_database_connection_errors(self) -> bool:
        """Check for database connection errors."""
        try:
            # This would integrate with your error tracking system
            # For now, return False as placeholder
            return False
        except Exception:
            return False
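    # One possible implementation (illustrative only): count recent connection
    # errors from a counter that error-handling middleware is assumed to bump in
    # the Django cache. The cache key 'db_connection_errors_5m' and the threshold
    # of 5 errors are assumptions, not part of the current system.
    #
    # from django.core.cache import cache
    # error_count = cache.get('db_connection_errors_5m', 0)
    # return error_count >= 5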
    def _check_high_error_rate(self) -> bool:
        """Check for high application error rate."""
        try:
            # This would check application error rates
            # For now, return False as placeholder
            return False
        except Exception:
            return False
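    # One possible implementation (illustrative only): derive the error rate from
    # request/error counters that request middleware is assumed to maintain in the
    # Django cache. The key names and the 5% threshold mirror the rule's
    # description template but are otherwise assumptions.
    #
    # from django.core.cache import cache
    # total = cache.get('requests_total_5m', 0)
    # errors = cache.get('requests_5xx_5m', 0)
    # return total > 0 and (errors / total) * 100 > 5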
    def _check_low_active_users(self) -> bool:
        """Check for low active users."""
        try:
            from django.contrib.auth import get_user_model
            User = get_user_model()
            five_minutes_ago = timezone.now() - timedelta(minutes=5)
            active_count = User.objects.filter(
                last_login__gte=five_minutes_ago,
                is_active=True
            ).count()
            return active_count < 10
        except Exception:
            return False
    def _check_malaysian_service_degradation(self) -> bool:
        """Check for Malaysian service degradation."""
        try:
            # This would check Malaysian-specific service health
            # For now, return False as placeholder
            return False
        except Exception:
            return False
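    # One possible implementation (illustrative only): probe a configured list of
    # health-check URLs for the platform's Malaysian integrations and alert when
    # fewer than 99% respond. The ``MALAYSIAN_SERVICE_HEALTH_URLS`` setting is an
    # assumption, not an existing setting.
    #
    # urls = getattr(settings, 'MALAYSIAN_SERVICE_HEALTH_URLS', [])
    # if not urls:
    #     return False
    # healthy = sum(
    #     1 for url in urls
    #     if requests.get(url, timeout=5).status_code == 200
    # )
    # return (healthy / len(urls)) * 100 < 99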
    def _check_suspicious_login_activity(self) -> bool:
        """Check for suspicious login activity."""
        try:
            # This would check for suspicious login patterns
            # For now, return False as placeholder
            return False
        except Exception:
            return False
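    # One possible implementation (illustrative only): compare per-IP failed-login
    # counts against a threshold. The counts are assumed to be maintained in the
    # Django cache by a receiver for Django's ``user_login_failed`` signal; the
    # cache key and the threshold of 10 attempts are assumptions.
    #
    # from django.core.cache import cache
    # failed = cache.get('failed_logins_15m', {})  # e.g. {'203.0.113.5': 12}
    # return any(count > 10 for count in failed.values())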

class EmailNotifier:
    """Email notification system."""

    def should_notify(self, alert: Alert) -> bool:
        """Check if should send email notification."""
        # Send emails for warnings and above
        return alert.severity in [AlertSeverity.WARNING, AlertSeverity.ERROR, AlertSeverity.CRITICAL]

    def send(self, alert: Alert):
        """Send email notification."""
        try:
            subject = f"[{alert.severity.upper()}] {alert.title}"
            message = f"""
Alert Details:
- Title: {alert.title}
- Severity: {alert.severity}
- Category: {alert.category}
- Description: {alert.description}
- Time: {alert.created_at}
- Source: {alert.source}
- Tenant: {alert.tenant}

Additional Information:
{json.dumps(alert.metadata, indent=2)}
"""

            send_mail(
                subject,
                message,
                settings.DEFAULT_FROM_EMAIL,
                settings.ALERT_EMAIL_RECIPIENTS,
                fail_silently=False
            )

            logger.info(f"Email notification sent for alert: {alert.id}")
        except Exception as e:
            logger.error(f"Failed to send email notification: {e}")

class SlackNotifier:
    """Slack notification system."""

    def should_notify(self, alert: Alert) -> bool:
        """Check if should send Slack notification."""
        # Send Slack for all alerts
        return True

    def send(self, alert: Alert):
        """Send Slack notification."""
        try:
            webhook_url = settings.SLACK_WEBHOOK_URL

            # Color based on severity
            colors = {
                AlertSeverity.INFO: '#36a64f',
                AlertSeverity.WARNING: '#ff9500',
                AlertSeverity.ERROR: '#ff0000',
                AlertSeverity.CRITICAL: '#990000'
            }

            payload = {
                'text': f'{alert.severity.upper()}: {alert.title}',
                'attachments': [{
                    'color': colors.get(alert.severity, '#36a64f'),
                    'title': alert.title,
                    'text': alert.description,
                    'fields': [
                        {'title': 'Severity', 'value': alert.severity, 'short': True},
                        {'title': 'Category', 'value': alert.category, 'short': True},
                        {'title': 'Time', 'value': alert.created_at.strftime('%Y-%m-%d %H:%M:%S'), 'short': True},
                        {'title': 'Tenant', 'value': alert.tenant, 'short': True},
                    ],
                    'footer': 'Malaysian SME Platform Alert System',
                    'ts': int(alert.created_at.timestamp())
                }]
            }

            response = requests.post(webhook_url, json=payload, timeout=10)
            response.raise_for_status()

            logger.info(f"Slack notification sent for alert: {alert.id}")
        except Exception as e:
            logger.error(f"Failed to send Slack notification: {e}")

class SMSNotifier:
    """SMS notification system for critical alerts."""

    def should_notify(self, alert: Alert) -> bool:
        """Check if should send SMS notification."""
        # Only send SMS for critical alerts
        return alert.severity == AlertSeverity.CRITICAL

    def send(self, alert: Alert):
        """Send SMS notification."""
        try:
            # This would integrate with Malaysian SMS service
            # For now, just log the attempt
            logger.info(f"SMS notification would be sent for critical alert: {alert.id}")

            # Example integration with Malaysian SMS service
            # sms_api_url = settings.SMS_API_URL
            # api_key = settings.SMS_API_KEY
            # recipients = settings.CRITICAL_ALERT_SMS_RECIPIENTS

            # message = f"CRITICAL: {alert.title}. {alert.description[:100]}"
            # payload = {
            #     'api_key': api_key,
            #     'recipients': recipients,
            #     'message': message
            # }

            # response = requests.post(sms_api_url, json=payload, timeout=10)
            # response.raise_for_status()

        except Exception as e:
            logger.error(f"Failed to send SMS notification: {e}")

# Global alert manager instance
alert_manager = AlertManager()
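
# Illustrative usage (assumptions, not wiring that exists in this module):
# ``check_rules()`` is expected to be invoked periodically, e.g. from a Celery
# beat task or a cron-driven management command, and ad-hoc alerts can be
# raised directly. The task below is an example only.
#
# from celery import shared_task
#
# @shared_task
# def run_alert_checks():
#     alert_manager.check_rules()
#
# alert_manager.trigger_alert(Alert(
#     title='Manual maintenance notice',
#     description='Scheduled maintenance window starting',
#     severity=AlertSeverity.INFO,
#     category=AlertCategory.SYSTEM,
# ))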