Some checks failed
System Monitoring / Health Checks (push) Has been cancelled
System Monitoring / Performance Monitoring (push) Has been cancelled
System Monitoring / Database Monitoring (push) Has been cancelled
System Monitoring / Cache Monitoring (push) Has been cancelled
System Monitoring / Log Monitoring (push) Has been cancelled
System Monitoring / Resource Monitoring (push) Has been cancelled
System Monitoring / Uptime Monitoring (push) Has been cancelled
System Monitoring / Backup Monitoring (push) Has been cancelled
System Monitoring / Security Monitoring (push) Has been cancelled
System Monitoring / Monitoring Dashboard (push) Has been cancelled
System Monitoring / Alerting (push) Has been cancelled
Security Scanning / Dependency Scanning (push) Has been cancelled
Security Scanning / Code Security Scanning (push) Has been cancelled
Security Scanning / Secrets Scanning (push) Has been cancelled
Security Scanning / Container Security Scanning (push) Has been cancelled
Security Scanning / Compliance Checking (push) Has been cancelled
Security Scanning / Security Dashboard (push) Has been cancelled
Security Scanning / Security Remediation (push) Has been cancelled
282 lines
9.3 KiB
YAML
282 lines
9.3 KiB
YAML
groups:
# Host-level alerts built on node_* metrics: CPU, memory and filesystem usage.
# Each resource has a warning rule (> 80 for 5m) and a critical rule (> 90 for 2m).
- name: system
  rules:
  # CPU busy percentage per instance, computed as 100 minus the averaged idle share.
  # rate() is used rather than irate(): irate() only considers the last two samples
  # in the window, so one quiet scrape can drop the value below the threshold and
  # reset the `for` timer. The Prometheus docs recommend rate() for alerting.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "CPU usage is {{ $value }}% for more than 5 minutes"

  # Same expression as HighCPUUsage with a higher threshold and a shorter hold time.
  - alert: CriticalCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical CPU usage on {{ $labels.instance }}"
      description: "CPU usage is {{ $value }}% for more than 2 minutes"

  # Memory in use as a percentage: 1 - MemAvailable / MemTotal.
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "Memory usage is {{ $value }}% for more than 5 minutes"

  - alert: CriticalMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical memory usage on {{ $labels.instance }}"
      description: "Memory usage is {{ $value }}% for more than 2 minutes"

  # Filesystem usage as a percentage: 1 - avail / size, evaluated per series
  # (the summary reports both the instance and mountpoint labels).
  # NOTE(review): no fstype/mountpoint filter is applied, so every filesystem that
  # exports these metrics is evaluated - confirm that pseudo filesystems are wanted.
  - alert: LowDiskSpace
    expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Low disk space on {{ $labels.instance }} {{ $labels.mountpoint }}"
      description: "Disk usage is {{ $value }}% for more than 5 minutes"

  - alert: CriticalDiskSpace
    expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical disk space on {{ $labels.instance }} {{ $labels.mountpoint }}"
      description: "Disk usage is {{ $value }}% for more than 2 minutes"

# Backend application alerts: availability, latency, HTTP error ratio and
# database connection errors.
- name: application
  rules:
  # Fires when the scrape target for job "backend" reports up == 0 for 1 minute.
  - alert: ApplicationDown
    expr: up{job="backend"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Application is down"
      description: "Backend application on {{ $labels.instance }} has been down for more than 1 minute"

  # 95th percentile request duration over the last 5 minutes.
  # The buckets are wrapped in rate(): histogram buckets are cumulative counters, so
  # passing them raw to histogram_quantile() yields the quantile over the whole
  # process lifetime instead of the recent window.
  - alert: HighResponseTime
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="backend"}[5m])) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High response time detected"
      description: "95th percentile response time is {{ $value }} seconds for more than 5 minutes"

  # Same expression as HighResponseTime with a 5 second threshold and a shorter hold time.
  - alert: CriticalResponseTime
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="backend"}[5m])) > 5
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical response time detected"
      description: "95th percentile response time is {{ $value }} seconds for more than 2 minutes"

  # Percentage of requests answered with a 5xx status over the last 5 minutes.
  # Both sides are aggregated with sum(): dividing the two instant vectors directly
  # uses one-to-one label matching, which pairs each 5xx series only with itself
  # and therefore always evaluates to 100%.
  - alert: HighErrorRate
    expr: (sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High error rate detected"
      description: "HTTP 5xx error rate is {{ $value }}% for more than 5 minutes"

  - alert: CriticalErrorRate
    expr: (sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 > 10
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical error rate detected"
      description: "HTTP 5xx error rate is {{ $value }}% for more than 2 minutes"

  # Any non-zero per-second rate of database_connection_errors_total held for 2 minutes.
  - alert: DatabaseConnectionErrors
    expr: rate(database_connection_errors_total[5m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Database connection errors detected"
      description: "Database connection errors rate is {{ $value }} per second"

# PostgreSQL alerts: availability, connection saturation, query rate and deadlocks.
- name: database
  rules:
  # Fires when the scrape target for job "postgres" reports up == 0 for 1 minute.
  - alert: PostgreSQLDown
    expr: up{job="postgres"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "PostgreSQL is down"
      description: "PostgreSQL database on {{ $labels.instance }} has been down for more than 1 minute"

  # Connections in use as a percentage of max_connections, per instance.
  # Backends are summed per instance and matched `on (instance)` because the two
  # metrics carry different label sets; a plain division matches nothing when one
  # side has extra labels, and a per-database count would understate total usage.
  # NOTE(review): assumes numbackends is exported per database and that there is one
  # pg_settings_max_connections series per instance - verify against the exporter.
  - alert: HighDatabaseConnections
    expr: sum by (instance) (pg_stat_database_numbackends) / on (instance) pg_settings_max_connections * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High database connections"
      description: "Database connection usage is {{ $value }}% for more than 5 minutes"

  - alert: CriticalDatabaseConnections
    expr: sum by (instance) (pg_stat_database_numbackends) / on (instance) pg_settings_max_connections * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical database connections"
      description: "Database connection usage is {{ $value }}% for more than 2 minutes"

  # NOTE(review): the expression thresholds the per-second rate of
  # pg_stat_database_calls_total and has no duration component - confirm this metric
  # really counts slow queries, as the summary and description claim.
  - alert: SlowQueries
    expr: rate(pg_stat_database_calls_total[5m]) > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High number of slow queries"
      description: "Database slow queries rate is {{ $value }} per second"

  # Any non-zero per-second rate of pg_stat_database_deadlocks held for 2 minutes.
  - alert: DatabaseDeadlocks
    expr: rate(pg_stat_database_deadlocks[5m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Database deadlocks detected"
      description: "Database deadlock rate is {{ $value }} per second"

# Redis alerts: availability, memory usage relative to the configured maximum,
# and connection errors.
- name: cache
  rules:
  # Fires when the scrape target for job "redis" reports up == 0 for 1 minute.
  - alert: RedisDown
    expr: up{job="redis"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Redis is down"
      description: "Redis cache on {{ $labels.instance }} has been down for more than 1 minute"

  # Used memory as a percentage of redis_memory_max_bytes.
  # The `and on (instance) ... > 0` guard drops instances whose maximum is 0:
  # dividing by zero yields +Inf in PromQL, which would satisfy the threshold
  # permanently.
  # NOTE(review): presumably a maximum of 0 means "no limit configured" - confirm
  # against the exporter in use.
  - alert: HighRedisMemoryUsage
    expr: (redis_memory_used_bytes / redis_memory_max_bytes) * 100 > 80 and on (instance) redis_memory_max_bytes > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High Redis memory usage"
      description: "Redis memory usage is {{ $value }}% for more than 5 minutes"

  - alert: CriticalRedisMemoryUsage
    expr: (redis_memory_used_bytes / redis_memory_max_bytes) * 100 > 90 and on (instance) redis_memory_max_bytes > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Critical Redis memory usage"
      description: "Redis memory usage is {{ $value }}% for more than 2 minutes"

  # Any non-zero per-second rate of redis_connection_errors_total held for 2 minutes.
  - alert: RedisConnectionErrors
    expr: rate(redis_connection_errors_total[5m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Redis connection errors detected"
      description: "Redis connection errors rate is {{ $value }} per second"

# Business-level alerts: user activity, failed logins, payment failures,
# tenant quota usage and Malaysian service availability.
- name: business
  rules:
  # Informational: the active_users value has stayed below 100 for 30 minutes.
  - alert: LowActiveUsers
    expr: active_users < 100
    for: 30m
    labels:
      severity: info
    annotations:
      summary: "Low number of active users"
      description: "Only {{ $value }} active users detected in the last 30 minutes"

  # More than 10 failed logins per second (5 minute rate), held for 5 minutes.
  - alert: HighFailedLogins
    expr: rate(auth_failed_logins_total[5m]) > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High number of failed login attempts"
      description: "Failed login attempts rate is {{ $value }} per second for more than 5 minutes"

  # Any non-zero per-second rate of payment_failures_total held for 2 minutes.
  - alert: PaymentProcessingFailures
    expr: rate(payment_failures_total[5m]) > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Payment processing failures detected"
      description: "Payment processing failure rate is {{ $value }} per second"

  # Usage above 90% of quota; the description reports the tenant_id label.
  # NOTE(review): the division relies on both metrics carrying identical label
  # sets - confirm against the metric definitions.
  - alert: TenantResourceQuotaExceeded
    expr: tenant_resource_usage / tenant_resource_quota * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Tenant resource quota exceeded"
      description: "Tenant {{ $labels.tenant_id }} is using {{ $value }}% of their resource quota"

  # The threshold 0.99 treats the metric as a 0-1 ratio, so the description formats
  # the value with humanizePercentage; printing the raw value followed by "%"
  # would report e.g. "0.98%" for 98% availability.
  - alert: MalaysianServiceDegradation
    expr: malaysian_service_availability < 0.99
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Malaysian service degradation detected"
      description: "Malaysian service availability is {{ $value | humanizePercentage }} for more than 5 minutes"

# Security alerts: suspicious logins, brute-force patterns, and SQL injection /
# XSS attempts.
- name: security
  rules:
  # More than 5 suspicious login attempts per second (5 minute rate), held for 2 minutes.
  - alert: SuspiciousLoginActivity
    expr: rate(suspicious_login_attempts_total[5m]) > 5
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Suspicious login activity detected"
      description: "Suspicious login attempts rate is {{ $value }} per second"

  # More than 20 failed logins per second on a series with a non-empty ip label.
  - alert: BruteForceAttack
    expr: rate(auth_failed_logins_total{ip!=""}[5m]) > 20
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Potential brute force attack detected"
      description: "High rate of failed logins from IP {{ $labels.ip }}"

  # increase() over a 5 minute window is used instead of comparing the raw value:
  # a monotonically increasing counter stays above 0 after the first event, so the
  # alert would never resolve until the exporting process restarts.
  # NOTE(review): presumably a counter given the _total suffix - confirm.
  - alert: SQLInjectionAttempt
    expr: increase(sql_injection_attempts_total[5m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "SQL injection attempt detected"
      description: "SQL injection attempt detected from {{ $labels.ip }}"

  # Uses increase() for the same reason as SQLInjectionAttempt: only attempts
  # inside the last 5 minutes should keep the alert active.
  # NOTE(review): presumably a counter given the _total suffix - confirm.
  - alert: XSSAttempt
    expr: increase(xss_attempts_total[5m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "XSS attempt detected"
      description: "Cross-site scripting attempt detected from {{ $labels.ip }}"