Files
multitenetsaas/monitoring/alert_rules.yml
AHMET YILMAZ b3fff546e9
Some checks failed
System Monitoring / Health Checks (push) Has been cancelled
System Monitoring / Performance Monitoring (push) Has been cancelled
System Monitoring / Database Monitoring (push) Has been cancelled
System Monitoring / Cache Monitoring (push) Has been cancelled
System Monitoring / Log Monitoring (push) Has been cancelled
System Monitoring / Resource Monitoring (push) Has been cancelled
System Monitoring / Uptime Monitoring (push) Has been cancelled
System Monitoring / Backup Monitoring (push) Has been cancelled
System Monitoring / Security Monitoring (push) Has been cancelled
System Monitoring / Monitoring Dashboard (push) Has been cancelled
System Monitoring / Alerting (push) Has been cancelled
Security Scanning / Dependency Scanning (push) Has been cancelled
Security Scanning / Code Security Scanning (push) Has been cancelled
Security Scanning / Secrets Scanning (push) Has been cancelled
Security Scanning / Container Security Scanning (push) Has been cancelled
Security Scanning / Compliance Checking (push) Has been cancelled
Security Scanning / Security Dashboard (push) Has been cancelled
Security Scanning / Security Remediation (push) Has been cancelled
project initialization
2025-10-05 02:37:33 +08:00

282 lines
9.3 KiB
YAML

groups:
# Host resource alerts on node_* metrics. Each resource has a warning tier
# (> 80% for 5m) and a critical tier (> 90% for 2m).
- name: system
  rules:
  # CPU busy % = 100 - idle %, averaged over all idle-mode series per instance.
  # rate() is used instead of irate(): irate() only considers the last two
  # samples in the window, which makes an alert flap on short spikes and can
  # keep resetting the `for` timer.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "CPU usage is {{ $value }}% for more than 5 minutes"
  - alert: CriticalCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical CPU usage on {{ $labels.instance }}"
      description: "CPU usage is {{ $value }}% for more than 2 minutes"
  # Memory used % = 1 - (MemAvailable / MemTotal).
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "Memory usage is {{ $value }}% for more than 5 minutes"
  - alert: CriticalMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical memory usage on {{ $labels.instance }}"
      description: "Memory usage is {{ $value }}% for more than 2 minutes"
  # Disk used % = 1 - (avail / size), evaluated per filesystem series.
  # NOTE(review): there is no fstype/mountpoint filter, so every exported
  # filesystem is evaluated - confirm pseudo or read-only filesystems are
  # meant to be included.
  - alert: LowDiskSpace
    expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Low disk space on {{ $labels.instance }} {{ $labels.mountpoint }}"
      description: "Disk usage is {{ $value }}% for more than 5 minutes"
  - alert: CriticalDiskSpace
    expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical disk space on {{ $labels.instance }} {{ $labels.mountpoint }}"
      description: "Disk usage is {{ $value }}% for more than 2 minutes"
# Backend application alerts: availability, latency, HTTP error ratio and
# database connection errors.
- name: application
  rules:
  - alert: ApplicationDown
    expr: up{job="backend"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Application is down"
      description: "Backend application on {{ $labels.instance }} has been down for more than 1 minute"
  # histogram_quantile() must be fed per-second bucket rates aggregated by
  # `le`. Applied to the raw cumulative buckets it would report the quantile
  # over the whole process lifetime instead of the last 5 minutes.
  - alert: HighResponseTime
    expr: >-
      histogram_quantile(0.95,
      sum by (le) (rate(http_request_duration_seconds_bucket{job="backend"}[5m]))) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High response time detected"
      description: "95th percentile response time is {{ $value }} seconds for more than 5 minutes"
  - alert: CriticalResponseTime
    expr: >-
      histogram_quantile(0.95,
      sum by (le) (rate(http_request_duration_seconds_bucket{job="backend"}[5m]))) > 5
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical response time detected"
      description: "95th percentile response time is {{ $value }} seconds for more than 2 minutes"
  # Both sides are summed per job before dividing. Without aggregation the
  # division matches series one-to-one on every label (including `status`),
  # so each 5xx series would be divided by itself and always yield 100%.
  - alert: HighErrorRate
    expr: >-
      sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum by (job) (rate(http_requests_total[5m])) * 100 > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High error rate detected"
      description: "HTTP 5xx error rate is {{ $value }}% for more than 5 minutes"
  - alert: CriticalErrorRate
    expr: >-
      sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum by (job) (rate(http_requests_total[5m])) * 100 > 10
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical error rate detected"
      description: "HTTP 5xx error rate is {{ $value }}% for more than 2 minutes"
  # Fires on any non-zero error rate sustained for 2 minutes.
  - alert: DatabaseConnectionErrors
    expr: rate(database_connection_errors_total[5m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Database connection errors detected"
      description: "Database connection errors rate is {{ $value }} per second"
# PostgreSQL alerts: availability, connection saturation, query volume and
# deadlocks.
- name: database
  rules:
  - alert: PostgreSQLDown
    expr: up{job="postgres"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "PostgreSQL is down"
      description: "PostgreSQL database on {{ $labels.instance }} has been down for more than 1 minute"
  # Connection usage % = total backends on the instance / max_connections.
  # Both sides are aggregated to the `instance` label so the division can
  # match: the per-database backend series carry labels that the
  # max_connections series does not, so an unaggregated division returns no
  # samples and the alert can never fire.
  - alert: HighDatabaseConnections
    expr: >-
      sum by (instance) (pg_stat_database_numbackends)
      / max by (instance) (pg_settings_max_connections) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High database connections"
      description: "Database connection usage is {{ $value }}% for more than 5 minutes"
  - alert: CriticalDatabaseConnections
    expr: >-
      sum by (instance) (pg_stat_database_numbackends)
      / max by (instance) (pg_settings_max_connections) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical database connections"
      description: "Database connection usage is {{ $value }}% for more than 2 minutes"
  # NOTE(review): this expression measures a call rate, not query duration,
  # and the metric name could not be verified here - confirm that
  # pg_stat_database_calls_total is actually exported and means "slow queries".
  - alert: SlowQueries
    expr: rate(pg_stat_database_calls_total[5m]) > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High number of slow queries"
      description: "Database slow queries rate is {{ $value }} per second"
  # Fires on any non-zero deadlock rate sustained for 2 minutes.
  - alert: DatabaseDeadlocks
    expr: rate(pg_stat_database_deadlocks[5m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Database deadlocks detected"
      description: "Database deadlock rate is {{ $value }} per second"
# Redis alerts: availability, memory saturation and connection errors.
- name: cache
  rules:
  - alert: RedisDown
    expr: up{job="redis"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Redis is down"
      description: "Redis cache on {{ $labels.instance }} has been down for more than 1 minute"
  # Memory usage % = used / max. The `and redis_memory_max_bytes > 0` guard
  # drops instances reporting a max of 0 (no memory limit configured);
  # dividing by 0 yields +Inf, which would otherwise keep the alert firing
  # permanently.
  - alert: HighRedisMemoryUsage
    expr: >-
      (redis_memory_used_bytes / redis_memory_max_bytes) * 100 > 80
      and redis_memory_max_bytes > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High Redis memory usage"
      description: "Redis memory usage is {{ $value }}% for more than 5 minutes"
  - alert: CriticalRedisMemoryUsage
    expr: >-
      (redis_memory_used_bytes / redis_memory_max_bytes) * 100 > 90
      and redis_memory_max_bytes > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical Redis memory usage"
      description: "Redis memory usage is {{ $value }}% for more than 2 minutes"
  # NOTE(review): confirm redis_connection_errors_total is a metric the
  # deployed exporter actually exposes; if it is absent this rule never fires.
  - alert: RedisConnectionErrors
    expr: rate(redis_connection_errors_total[5m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Redis connection errors detected"
      description: "Redis connection errors rate is {{ $value }} per second"
# Business-level alerts: user activity, authentication failures, payments,
# tenant quotas and regional service availability.
- name: business
  rules:
  - alert: LowActiveUsers
    expr: active_users < 100
    for: 30m
    labels:
      severity: info
    annotations:
      summary: "Low number of active users"
      description: "Only {{ $value }} active users detected in the last 30 minutes"
  - alert: HighFailedLogins
    expr: rate(auth_failed_logins_total[5m]) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High number of failed login attempts"
      description: "Failed login attempts rate is {{ $value }} per second for more than 5 minutes"
  # Fires on any non-zero payment failure rate sustained for 2 minutes.
  - alert: PaymentProcessingFailures
    expr: rate(payment_failures_total[5m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Payment processing failures detected"
      description: "Payment processing failure rate is {{ $value }} per second"
  # Fires once usage passes 90% of the quota, i.e. before it is exceeded.
  - alert: TenantResourceQuotaExceeded
    expr: tenant_resource_usage / tenant_resource_quota * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Tenant resource quota exceeded"
      description: "Tenant {{ $labels.tenant_id }} is using {{ $value }}% of their resource quota"
  # The 0.99 threshold treats the metric as a 0-1 ratio, so the value is
  # rendered with humanizePercentage; printing the raw value followed by "%"
  # would show e.g. "0.98%" for 98% availability.
  - alert: MalaysianServiceDegradation
    expr: malaysian_service_availability < 0.99
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Malaysian service degradation detected"
      description: "Malaysian service availability is {{ $value | humanizePercentage }} for more than 5 minutes"
# Security alerts: suspicious logins, brute-force attempts and detected
# injection/XSS attempts.
- name: security
  rules:
  - alert: SuspiciousLoginActivity
    expr: rate(suspicious_login_attempts_total[5m]) > 5
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Suspicious login activity detected"
      description: "Suspicious login attempts rate is {{ $value }} per second"
  # Evaluated per series, so the threshold applies to each `ip` label value.
  - alert: BruteForceAttack
    expr: rate(auth_failed_logins_total{ip!=""}[5m]) > 20
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Potential brute force attack detected"
      description: "High rate of failed logins from IP {{ $labels.ip }}"
  # increase() over a window is used instead of comparing the raw value to 0:
  # assuming the *_total metrics are cumulative counters, a raw comparison
  # stays true forever after the first attempt and the alert never resolves.
  - alert: SQLInjectionAttempt
    expr: increase(sql_injection_attempts_total[5m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "SQL injection attempt detected"
      description: "SQL injection attempt detected from {{ $labels.ip }}"
  - alert: XSSAttempt
    expr: increase(xss_attempts_total[5m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "XSS attempt detected"
      description: "Cross-site scripting attempt detected from {{ $labels.ip }}"