Some checks failed
System Monitoring / Health Checks (push) Has been cancelled
System Monitoring / Performance Monitoring (push) Has been cancelled
System Monitoring / Database Monitoring (push) Has been cancelled
System Monitoring / Cache Monitoring (push) Has been cancelled
System Monitoring / Log Monitoring (push) Has been cancelled
System Monitoring / Resource Monitoring (push) Has been cancelled
System Monitoring / Uptime Monitoring (push) Has been cancelled
System Monitoring / Backup Monitoring (push) Has been cancelled
System Monitoring / Security Monitoring (push) Has been cancelled
System Monitoring / Monitoring Dashboard (push) Has been cancelled
System Monitoring / Alerting (push) Has been cancelled
Security Scanning / Dependency Scanning (push) Has been cancelled
Security Scanning / Code Security Scanning (push) Has been cancelled
Security Scanning / Secrets Scanning (push) Has been cancelled
Security Scanning / Container Security Scanning (push) Has been cancelled
Security Scanning / Compliance Checking (push) Has been cancelled
Security Scanning / Security Dashboard (push) Has been cancelled
Security Scanning / Security Remediation (push) Has been cancelled
297 lines
9.7 KiB
YAML
297 lines
9.7 KiB
YAML
groups:
# Host-level recording rules derived from node_* metrics.
- name: system
  rules:
    # CPU recording rules
    # Share of CPU time not spent in mode="idle", averaged per instance (percent).
    - record: node_cpu_usage
      expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

    # Per-instance average share of CPU time in each individual mode (percent).
    - record: node_cpu_usage_iowait
      expr: avg by(instance) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100

    - record: node_cpu_usage_system
      expr: avg by(instance) (irate(node_cpu_seconds_total{mode="system"}[5m])) * 100

    - record: node_cpu_usage_user
      expr: avg by(instance) (irate(node_cpu_seconds_total{mode="user"}[5m])) * 100

    # Memory recording rules
    # Used memory is computed as MemTotal minus MemAvailable.
    - record: node_memory_usage_percent
      expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

    - record: node_memory_usage_bytes
      expr: node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes

    # Sum of the Cached and Buffers gauges.
    - record: node_memory_cached_bytes
      expr: node_memory_Cached_bytes + node_memory_Buffers_bytes

    # The `> 0` filter drops series whose SwapTotal is 0, so the division
    # cannot produce NaN samples (0 / 0); such series simply have no sample
    # for this rule.
    - record: node_memory_swap_usage_percent
      expr: (1 - (node_memory_SwapFree_bytes / (node_memory_SwapTotal_bytes > 0))) * 100

    # Disk recording rules
    # Used fraction of each filesystem series (1 - avail / size), in percent.
    - record: node_disk_usage_percent
      expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100

    # Per-second rates of the disk counters over a 5m window.
    - record: node_disk_read_iops
      expr: rate(node_disk_reads_completed_total[5m])

    - record: node_disk_write_iops
      expr: rate(node_disk_writes_completed_total[5m])

    - record: node_disk_read_bytes
      expr: rate(node_disk_read_bytes_total[5m])

    - record: node_disk_write_bytes
      expr: rate(node_disk_written_bytes_total[5m])

    # Network recording rules
    # Per-second rates of the network counters over a 5m window.
    - record: node_network_receive_bytes
      expr: rate(node_network_receive_bytes_total[5m])

    - record: node_network_transmit_bytes
      expr: rate(node_network_transmit_bytes_total[5m])

    - record: node_network_receive_errors
      expr: rate(node_network_receive_errs_total[5m])

    - record: node_network_transmit_errors
      expr: rate(node_network_transmit_errs_total[5m])

# Application-level recording rules: HTTP traffic and backend process metrics.
- name: application
  rules:
    # HTTP request metrics
    # Per-series request rates over a 5m window.
    - record: http_requests_per_second
      expr: rate(http_requests_total[5m])

    - record: http_requests_5xx_per_second
      expr: rate(http_requests_total{status=~"5.."}[5m])

    - record: http_requests_4xx_per_second
      expr: rate(http_requests_total{status=~"4.."}[5m])

    # Both sides are aggregated over `status` before dividing. Without the
    # aggregation, one-to-one vector matching pairs every 5xx series only with
    # itself, so the ratio is always 1 (i.e. 100%).
    # All other labels are preserved. Label sets that have no 5xx series at
    # all produce no sample for this rule.
    - record: http_error_rate_percent
      expr: (sum without (status) (rate(http_requests_total{status=~"5.."}[5m])) / sum without (status) (rate(http_requests_total[5m]))) * 100

    # histogram_quantile() must be fed per-second bucket rates; applying it to
    # the raw cumulative bucket counters yields a quantile over the whole
    # process lifetime instead of the recent window.
    - record: http_response_time_p50
      expr: histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))

    - record: http_response_time_p95
      expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

    - record: http_response_time_p99
      expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))

    # Application performance
    # All rules below select series with job="backend".
    - record: application_memory_usage_bytes
      expr: process_resident_memory_bytes{job="backend"}

    # CPU seconds consumed per second, scaled by 100.
    - record: application_cpu_usage_percent
      expr: rate(process_cpu_seconds_total{job="backend"}[5m]) * 100

    # Current evaluation time minus the recorded process start time.
    - record: application_uptime_seconds
      expr: time() - process_start_time_seconds{job="backend"}

    # NOTE(review): the Go client library normally exposes this as
    # go_gc_duration_seconds_sum; confirm the backend really exports a
    # process_go_gc_duration_seconds_sum metric.
    - record: application_gc_pause_seconds_total
      expr: rate(process_go_gc_duration_seconds_sum{job="backend"}[5m])

# PostgreSQL recording rules.
- name: database
  rules:
    # PostgreSQL metrics
    # Per-second rates of the pg_stat_database counters over a 5m window.
    - record: pg_stat_database_calls_total_rate
      expr: rate(pg_stat_database_calls_total[5m])

    - record: pg_stat_database_rows_returned_rate
      expr: rate(pg_stat_database_rows_returned_total[5m])

    - record: pg_stat_database_rows_fetched_rate
      expr: rate(pg_stat_database_rows_fetched_total[5m])

    - record: pg_stat_database_rows_inserted_rate
      expr: rate(pg_stat_database_rows_inserted_total[5m])

    - record: pg_stat_database_rows_updated_rate
      expr: rate(pg_stat_database_rows_updated_total[5m])

    - record: pg_stat_database_rows_deleted_rate
      expr: rate(pg_stat_database_rows_deleted_total[5m])

    # Backends are summed per instance and matched to max_connections on the
    # `instance` label only. A plain division requires identical label sets on
    # both sides and returns nothing when the backend gauge carries
    # per-database labels that the settings gauge does not have.
    # NOTE(review): assumes both metrics carry an `instance` label and that
    # pg_settings_max_connections has one series per instance - confirm
    # against the exporter in use.
    - record: pg_stat_database_connections_usage_percent
      expr: (sum by (instance) (pg_stat_database_numbackends) / on (instance) pg_settings_max_connections) * 100

    - record: pg_stat_database_deadlocks_rate
      expr: rate(pg_stat_database_deadlocks[5m])

    - record: pg_stat_database_temp_files_rate
      expr: rate(pg_stat_database_temp_files_total[5m])

    - record: pg_stat_database_temp_bytes_rate
      expr: rate(pg_stat_database_temp_bytes_total[5m])

    # Query performance
    - record: pg_stat_statements_total_time_rate
      expr: rate(pg_stat_statements_total_time_ms[5m])

    # Time accumulated per call over the window: rate of total time divided by
    # rate of calls.
    - record: pg_stat_statements_mean_time_ms
      expr: rate(pg_stat_statements_total_time_ms[5m]) / rate(pg_stat_statements_calls[5m])

    - record: pg_stat_statements_rows_per_second
      expr: rate(pg_stat_statements_rows[5m])

# Redis recording rules.
- name: cache
  rules:
    # Redis metrics
    # Per-second rates of the Redis counters over a 5m window.
    - record: redis_commands_per_second
      expr: rate(redis_commands_total[5m])

    - record: redis_keyspace_hits_per_second
      expr: rate(redis_keyspace_hits_total[5m])

    - record: redis_keyspace_misses_per_second
      expr: rate(redis_keyspace_misses_total[5m])

    # hits / (hits + misses), in percent.
    - record: redis_keyspace_hit_rate_percent
      expr: (rate(redis_keyspace_hits_total[5m]) / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))) * 100

    # The `> 0` filter drops series whose redis_memory_max_bytes is 0, so the
    # division cannot produce +Inf samples; such series have no sample for
    # this rule.
    - record: redis_memory_usage_percent
      expr: (redis_memory_used_bytes / (redis_memory_max_bytes > 0)) * 100

    # The former rules that recorded redis_connected_clients and
    # redis_blocked_clients under their own names were removed: the expression
    # was the metric itself, so each evaluation wrote a sample into the very
    # series it read, colliding with the scraped samples. Query the scraped
    # metrics directly; their names are unchanged.

    - record: redis_connections_received_per_second
      expr: rate(redis_connections_received_total[5m])

    - record: redis_connections_rejected_per_second
      expr: rate(redis_connections_rejected_total[5m])

    - record: redis_expired_keys_per_second
      expr: rate(redis_expired_keys_total[5m])

    - record: redis_evicted_keys_per_second
      expr: rate(redis_evicted_keys_total[5m])

# Business-level recording rules: users, authentication, tenants, payments.
- name: business
  rules:
    # User metrics
    # count_changes() is not a PromQL function and makes the whole rule file
    # fail to load; changes() is the built-in that counts how often a series
    # changed value within the range.
    # NOTE(review): confirm that "number of value changes in 5m" is the
    # intended meaning of active_users.
    - record: active_users
      expr: changes(active_users_total[5m])

    # increase() returns the total growth over the 1h window. rate() would
    # return a per-second average, which does not match the "per_hour" name.
    - record: new_users_per_hour
      expr: increase(new_users_total[1h])

    # NOTE(review): user_sessions_total is recorded as-is here and also fed to
    # rate() below; confirm whether it is a gauge or a counter.
    - record: user_sessions_active
      expr: user_sessions_total

    - record: user_sessions_per_second
      expr: rate(user_sessions_total[5m])

    # Authentication metrics
    - record: auth_successful_logins_per_second
      expr: rate(auth_successful_logins_total[5m])

    - record: auth_failed_logins_per_second
      expr: rate(auth_failed_logins_total[5m])

    # failed / (successful + failed), in percent.
    - record: auth_failed_login_rate_percent
      expr: (rate(auth_failed_logins_total[5m]) / (rate(auth_successful_logins_total[5m]) + rate(auth_failed_logins_total[5m]))) * 100

    # Tenant metrics
    # Number of tenant_active_total series whose current value equals 1.
    - record: tenant_active_count
      expr: count(tenant_active_total == 1)

    - record: tenant_users_per_tenant
      expr: tenant_users_total

    - record: tenant_resource_usage_percent
      expr: (tenant_resource_usage_bytes / tenant_resource_quota_bytes) * 100

    # Business metrics
    - record: transactions_per_second
      expr: rate(transactions_total[5m])

    - record: transaction_success_rate_percent
      expr: (rate(transaction_successful_total[5m]) / rate(transactions_total[5m])) * 100

    - record: payment_success_rate_percent
      expr: (rate(payment_successful_total[5m]) / rate(payment_attempts_total[5m])) * 100

    - record: payment_failures_per_second
      expr: rate(payment_failures_total[5m])

    # Total growth over the last hour (see new_users_per_hour for why this is
    # increase() rather than rate()).
    - record: revenue_per_hour
      expr: increase(revenue_total[1h])

    # Malaysian-specific metrics
    # Same count_changes() -> changes() correction as active_users above.
    # NOTE(review): confirm the intended meaning.
    - record: malaysian_users_active
      expr: changes(malaysian_users_active_total[5m])

    - record: malaysian_transactions_per_second
      expr: rate(malaysian_transactions_total[5m])

    - record: sst_calculations_per_second
      expr: rate(sst_calculations_total[5m])

    - record: ic_validations_per_second
      expr: rate(ic_validations_total[5m])

    - record: malaysian_postcode_lookups_per_second
      expr: rate(malaysian_postcode_lookups_total[5m])

# Security recording rules. Every rule in this group is the per-second rate
# of a counter over a 5m window.
- name: security
  rules:
    # Security events
    - record: security_events_per_second
      expr: rate(security_events_total[5m])

    - record: blocked_requests_per_second
      expr: rate(blocked_requests_total[5m])

    - record: suspicious_ip_requests_per_second
      expr: rate(suspicious_ip_requests_total[5m])

    - record: rate_limit_exceeded_per_second
      expr: rate(rate_limit_exceeded_total[5m])

    # Web application firewall
    - record: waf_blocked_requests_per_second
      expr: rate(waf_blocked_requests_total[5m])

    - record: waf_sql_injection_attempts_per_second
      expr: rate(waf_sql_injection_attempts_total[5m])

    - record: waf_xss_attempts_per_second
      expr: rate(waf_xss_attempts_total[5m])

    - record: waf_path_traversal_attempts_per_second
      expr: rate(waf_path_traversal_attempts_total[5m])

    # Rate limiting
    # Rate of HTTP requests whose status label is exactly "429".
    - record: rate_limit_429_responses_per_second
      expr: rate(http_requests_total{status="429"}[5m])

    - record: rate_limit_by_ip_per_second
      expr: rate(rate_limit_by_ip_total[5m])

# Service-level recording rules: availability, latency and error budget.
- name: sla
  rules:
    # Service Level Objectives
    # Both sides of each ratio below are aggregated over `status` before
    # dividing. Without the aggregation, one-to-one vector matching pairs
    # every 5xx series only with itself, so the ratio is always 1 and the
    # recorded availability is always 0%.
    # Label sets that have no 5xx series at all produce no sample.
    - record: slo_http_availability_percentage
      expr: (1 - (sum without (status) (rate(http_requests_total{status=~"5.."}[1h])) / sum without (status) (rate(http_requests_total[1h])))) * 100

    # 1h p95 latency divided by 2, subtracted from 1 and expressed in percent.
    - record: slo_http_latency_p95_percentage
      expr: (1 - (histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[1h])) / 2)) * 100

    # `up` scaled by 100 for the postgres and redis jobs.
    - record: slo_database_availability_percentage
      expr: (up{job="postgres"} * 100)

    - record: slo_cache_availability_percentage
      expr: (up{job="redis"} * 100)

    # Error budget
    # The 0.01 factor is the allowed error fraction: the 5xx rate is compared
    # against 1% of the total request rate.
    # Remaining budget over 30d: 1 - errors / allowed errors, in percent.
    - record: error_budget_remaining_percentage
      expr: (1 - (sum without (status) (rate(http_requests_total{status=~"5.."}[30d])) / (sum without (status) (rate(http_requests_total[30d])) * 0.01))) * 100

    # Burn rate over 1h: errors / allowed errors (1.0 means the budget is
    # being consumed exactly at the allowed pace).
    - record: error_budget_burn_rate
      expr: (sum without (status) (rate(http_requests_total{status=~"5.."}[1h])) / (sum without (status) (rate(http_requests_total[1h])) * 0.01))