Files
multitenetsaas/monitoring/recording_rules.yml
AHMET YILMAZ b3fff546e9
Some checks failed
System Monitoring / Health Checks (push) Has been cancelled
System Monitoring / Performance Monitoring (push) Has been cancelled
System Monitoring / Database Monitoring (push) Has been cancelled
System Monitoring / Cache Monitoring (push) Has been cancelled
System Monitoring / Log Monitoring (push) Has been cancelled
System Monitoring / Resource Monitoring (push) Has been cancelled
System Monitoring / Uptime Monitoring (push) Has been cancelled
System Monitoring / Backup Monitoring (push) Has been cancelled
System Monitoring / Security Monitoring (push) Has been cancelled
System Monitoring / Monitoring Dashboard (push) Has been cancelled
System Monitoring / Alerting (push) Has been cancelled
Security Scanning / Dependency Scanning (push) Has been cancelled
Security Scanning / Code Security Scanning (push) Has been cancelled
Security Scanning / Secrets Scanning (push) Has been cancelled
Security Scanning / Container Security Scanning (push) Has been cancelled
Security Scanning / Compliance Checking (push) Has been cancelled
Security Scanning / Security Dashboard (push) Has been cancelled
Security Scanning / Security Remediation (push) Has been cancelled
project initialization
2025-10-05 02:37:33 +08:00

297 lines
9.7 KiB
YAML

groups:
# Host-level rules built from node_* metrics.
- name: system
  rules:
  # CPU recording rules
  # Non-idle CPU percentage: 100 minus the per-instance average idle rate.
  - record: node_cpu_usage
    expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  # Per-instance average percentage of CPU time in each named mode.
  - record: node_cpu_usage_iowait
    expr: avg by(instance) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100
  - record: node_cpu_usage_system
    expr: avg by(instance) (irate(node_cpu_seconds_total{mode="system"}[5m])) * 100
  - record: node_cpu_usage_user
    expr: avg by(instance) (irate(node_cpu_seconds_total{mode="user"}[5m])) * 100
  # Memory recording rules
  # "Used" is defined here as MemTotal minus MemAvailable.
  - record: node_memory_usage_percent
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
  - record: node_memory_usage_bytes
    expr: node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes
  # Sum of the Cached and Buffers gauges.
  - record: node_memory_cached_bytes
    expr: node_memory_Cached_bytes + node_memory_Buffers_bytes
  # The "> 0" filter drops series whose SwapTotal is 0, so no division by
  # zero (NaN) is recorded; such series simply produce no sample.
  - record: node_memory_swap_usage_percent
    expr: (1 - (node_memory_SwapFree_bytes / (node_memory_SwapTotal_bytes > 0))) * 100
  # Disk recording rules
  # Evaluated for every filesystem series; no selector filtering is applied.
  - record: node_disk_usage_percent
    expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100
  # Per-second rates over a 5m window. The *_bytes records below are
  # per-second rates of the byte counters, not absolute byte counts.
  - record: node_disk_read_iops
    expr: rate(node_disk_reads_completed_total[5m])
  - record: node_disk_write_iops
    expr: rate(node_disk_writes_completed_total[5m])
  - record: node_disk_read_bytes
    expr: rate(node_disk_read_bytes_total[5m])
  - record: node_disk_write_bytes
    expr: rate(node_disk_written_bytes_total[5m])
  # Network recording rules
  # Per-second rates over a 5m window for traffic and error counters.
  - record: node_network_receive_bytes
    expr: rate(node_network_receive_bytes_total[5m])
  - record: node_network_transmit_bytes
    expr: rate(node_network_transmit_bytes_total[5m])
  - record: node_network_receive_errors
    expr: rate(node_network_receive_errs_total[5m])
  - record: node_network_transmit_errors
    expr: rate(node_network_transmit_errs_total[5m])
# HTTP and process-level rules for the application.
- name: application
  rules:
  # HTTP request metrics
  # Per-second request rates over a 5m window, one series per label set.
  - record: http_requests_per_second
    expr: rate(http_requests_total[5m])
  - record: http_requests_5xx_per_second
    expr: rate(http_requests_total{status=~"5.."}[5m])
  - record: http_requests_4xx_per_second
    expr: rate(http_requests_total{status=~"4.."}[5m])
  # Both sides are summed with the "status" label removed. Without that,
  # one-to-one vector matching pairs each 5xx series only with itself and
  # the ratio is always 1 (100%). Label sets with no 5xx series produce no
  # sample.
  - record: http_error_rate_percent
    expr: >-
      (sum without (status) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum without (status) (rate(http_requests_total[5m]))) * 100
  # Latency quantiles are computed from the 5m rate of the bucket counters;
  # raw buckets are cumulative, so quantiles over them would describe the
  # whole counter lifetime rather than recent traffic.
  - record: http_response_time_p50
    expr: histogram_quantile(0.50, rate(http_request_duration_seconds_bucket[5m]))
  - record: http_response_time_p95
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
  - record: http_response_time_p99
    expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m]))
  # Application performance
  # All selectors below are restricted to job="backend".
  - record: application_memory_usage_bytes
    expr: process_resident_memory_bytes{job="backend"}
  # CPU-seconds consumed per second, times 100 (100 == one full core).
  - record: application_cpu_usage_percent
    expr: rate(process_cpu_seconds_total{job="backend"}[5m]) * 100
  # Seconds elapsed since the recorded process start time.
  - record: application_uptime_seconds
    expr: time() - process_start_time_seconds{job="backend"}
  # Per-second rate of the GC duration sum.
  # NOTE(review): Go client libraries usually expose this as
  # go_gc_duration_seconds_sum; confirm the source metric name exists.
  - record: application_gc_pause_seconds_total
    expr: rate(process_go_gc_duration_seconds_sum{job="backend"}[5m])
# PostgreSQL rules.
# NOTE(review): several source metric names below (for example
# pg_stat_database_calls_total) are presumably produced by custom exporter
# queries; confirm they exist on the scrape target.
- name: database
  rules:
  # PostgreSQL metrics
  # Per-second rates over a 5m window.
  - record: pg_stat_database_calls_total_rate
    expr: rate(pg_stat_database_calls_total[5m])
  - record: pg_stat_database_rows_returned_rate
    expr: rate(pg_stat_database_rows_returned_total[5m])
  - record: pg_stat_database_rows_fetched_rate
    expr: rate(pg_stat_database_rows_fetched_total[5m])
  - record: pg_stat_database_rows_inserted_rate
    expr: rate(pg_stat_database_rows_inserted_total[5m])
  - record: pg_stat_database_rows_updated_rate
    expr: rate(pg_stat_database_rows_updated_total[5m])
  - record: pg_stat_database_rows_deleted_rate
    expr: rate(pg_stat_database_rows_deleted_total[5m])
  # Backends are summed across databases before dividing. Otherwise the
  # per-database labels on the left have no counterpart on the right and
  # one-to-one matching returns nothing.
  # NOTE(review): assumes the per-database labels are named datid/datname;
  # confirm against the exporter in use.
  - record: pg_stat_database_connections_usage_percent
    expr: >-
      (sum without (datid, datname) (pg_stat_database_numbackends)
      / pg_settings_max_connections) * 100
  - record: pg_stat_database_deadlocks_rate
    expr: rate(pg_stat_database_deadlocks[5m])
  - record: pg_stat_database_temp_files_rate
    expr: rate(pg_stat_database_temp_files_total[5m])
  - record: pg_stat_database_temp_bytes_rate
    expr: rate(pg_stat_database_temp_bytes_total[5m])
  # Query performance
  - record: pg_stat_statements_total_time_rate
    expr: rate(pg_stat_statements_total_time_ms[5m])
  # Time spent per call over the window: total-time rate / call rate.
  - record: pg_stat_statements_mean_time_ms
    expr: rate(pg_stat_statements_total_time_ms[5m]) / rate(pg_stat_statements_calls[5m])
  - record: pg_stat_statements_rows_per_second
    expr: rate(pg_stat_statements_rows[5m])
# Redis rules.
- name: cache
  rules:
  # Redis metrics
  # Per-second rates over a 5m window.
  - record: redis_commands_per_second
    expr: rate(redis_commands_total[5m])
  - record: redis_keyspace_hits_per_second
    expr: rate(redis_keyspace_hits_total[5m])
  - record: redis_keyspace_misses_per_second
    expr: rate(redis_keyspace_misses_total[5m])
  # Hits as a percentage of hits + misses over the window.
  - record: redis_keyspace_hit_rate_percent
    expr: >-
      (rate(redis_keyspace_hits_total[5m])
      / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))) * 100
  # The "> 0" filter skips series whose max-bytes gauge is 0, so the
  # division cannot record +Inf; those series produce no sample.
  - record: redis_memory_usage_percent
    expr: (redis_memory_used_bytes / (redis_memory_max_bytes > 0)) * 100
  # redis_connected_clients and redis_blocked_clients are deliberately not
  # re-recorded. A rule whose output name equals its input metric writes
  # samples that collide with the scraped series of the same name and
  # labels. Query the scraped gauges directly; their names are unchanged.
  - record: redis_connections_received_per_second
    expr: rate(redis_connections_received_total[5m])
  - record: redis_connections_rejected_per_second
    expr: rate(redis_connections_rejected_total[5m])
  - record: redis_expired_keys_per_second
    expr: rate(redis_expired_keys_total[5m])
  - record: redis_evicted_keys_per_second
    expr: rate(redis_evicted_keys_total[5m])
# Product and business-level rules.
- name: business
  rules:
  # User metrics
  # changes() counts how often the sample value changed inside the window.
  # The previous count_changes() is not a PromQL function and prevented the
  # rule file from loading.
  # NOTE(review): confirm that "number of value changes" is the intended
  # meaning of active_users.
  - record: active_users
    expr: changes(active_users_total[5m])
  # increase() over 1h yields a per-hour amount; rate() would be per-second.
  - record: new_users_per_hour
    expr: increase(new_users_total[1h])
  # NOTE(review): aliases a *_total metric as "active"; confirm the source
  # is a gauge of currently open sessions.
  - record: user_sessions_active
    expr: user_sessions_total
  - record: user_sessions_per_second
    expr: rate(user_sessions_total[5m])
  # Authentication metrics
  - record: auth_successful_logins_per_second
    expr: rate(auth_successful_logins_total[5m])
  - record: auth_failed_logins_per_second
    expr: rate(auth_failed_logins_total[5m])
  # Failed logins as a percentage of all login attempts in the window.
  - record: auth_failed_login_rate_percent
    expr: >-
      (rate(auth_failed_logins_total[5m])
      / (rate(auth_successful_logins_total[5m]) + rate(auth_failed_logins_total[5m]))) * 100
  # Tenant metrics
  # Number of series whose value equals 1; "or vector(0)" makes the rule
  # emit 0 instead of nothing when no series matches.
  - record: tenant_active_count
    expr: count(tenant_active_total == 1) or vector(0)
  - record: tenant_users_per_tenant
    expr: tenant_users_total
  - record: tenant_resource_usage_percent
    expr: (tenant_resource_usage_bytes / tenant_resource_quota_bytes) * 100
  # Business metrics
  - record: transactions_per_second
    expr: rate(transactions_total[5m])
  - record: transaction_success_rate_percent
    expr: (rate(transaction_successful_total[5m]) / rate(transactions_total[5m])) * 100
  - record: payment_success_rate_percent
    expr: (rate(payment_successful_total[5m]) / rate(payment_attempts_total[5m])) * 100
  - record: payment_failures_per_second
    expr: rate(payment_failures_total[5m])
  # increase() over 1h yields a per-hour amount; rate() would be per-second.
  - record: revenue_per_hour
    expr: increase(revenue_total[1h])
  # Malaysian-specific metrics
  # changes() replaces the invalid count_changes(); same review note as for
  # active_users applies.
  - record: malaysian_users_active
    expr: changes(malaysian_users_active_total[5m])
  - record: malaysian_transactions_per_second
    expr: rate(malaysian_transactions_total[5m])
  - record: sst_calculations_per_second
    expr: rate(sst_calculations_total[5m])
  - record: ic_validations_per_second
    expr: rate(ic_validations_total[5m])
  - record: malaysian_postcode_lookups_per_second
    expr: rate(malaysian_postcode_lookups_total[5m])
# Security rules: every record is the per-second rate of one counter,
# taken over a 5m window.
- name: security
  rules:
  # Security events
  - record: security_events_per_second
    expr: 'rate(security_events_total[5m])'
  - record: blocked_requests_per_second
    expr: 'rate(blocked_requests_total[5m])'
  - record: suspicious_ip_requests_per_second
    expr: 'rate(suspicious_ip_requests_total[5m])'
  - record: rate_limit_exceeded_per_second
    expr: 'rate(rate_limit_exceeded_total[5m])'
  # Web application firewall
  - record: waf_blocked_requests_per_second
    expr: 'rate(waf_blocked_requests_total[5m])'
  - record: waf_sql_injection_attempts_per_second
    expr: 'rate(waf_sql_injection_attempts_total[5m])'
  - record: waf_xss_attempts_per_second
    expr: 'rate(waf_xss_attempts_total[5m])'
  - record: waf_path_traversal_attempts_per_second
    expr: 'rate(waf_path_traversal_attempts_total[5m])'
  # Rate limiting
  # Requests answered with HTTP status 429.
  - record: rate_limit_429_responses_per_second
    expr: 'rate(http_requests_total{status="429"}[5m])'
  - record: rate_limit_by_ip_per_second
    expr: 'rate(rate_limit_by_ip_total[5m])'
# SLO and error-budget rules.
- name: sla
  rules:
  # Service Level Objectives
  # Share of non-5xx requests over 1h. Both sides are aggregated with sum()
  # so the ratio compares all 5xx traffic with all traffic; dividing the
  # raw vectors pairs each 5xx series with itself and always yields 1.
  # "or vector(0)" supplies a zero numerator when no 5xx series exists.
  - record: slo_http_availability_percentage
    expr: >-
      (1 - ((sum(rate(http_requests_total{status=~"5.."}[1h])) or vector(0))
      / sum(rate(http_requests_total[1h])))) * 100
  # p95 latency over 1h expressed against a divisor of 2 (seconds): 100 at
  # zero latency, 0 when p95 equals 2, negative above that.
  - record: slo_http_latency_p95_percentage
    expr: (1 - (histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[1h])) / 2)) * 100
  # up is 1 or 0 per target, so these records are 100 or 0.
  - record: slo_database_availability_percentage
    expr: (up{job="postgres"} * 100)
  - record: slo_cache_availability_percentage
    expr: (up{job="redis"} * 100)
  # Error budget
  # The budget is 1% of total requests (the 0.01 factor). Remaining budget
  # is measured over 30d; aggregation is the same as for availability.
  - record: error_budget_remaining_percentage
    expr: >-
      (1 - ((sum(rate(http_requests_total{status=~"5.."}[30d])) or vector(0))
      / (sum(rate(http_requests_total[30d])) * 0.01))) * 100
  # Burn rate over 1h: observed error ratio divided by the 1% budget
  # (1.0 means the budget is being spent exactly at the allowed pace).
  - record: error_budget_burn_rate
    expr: >-
      ((sum(rate(http_requests_total{status=~"5.."}[1h])) or vector(0))
      / (sum(rate(http_requests_total[1h])) * 0.01))