""" Prometheus exporters for various system and application metrics. """ import time import logging import threading from typing import Dict, Any, List, Optional from datetime import datetime, timedelta from django.db import connection, connections from django.core.cache import cache from django.conf import settings from django.contrib.auth import get_user_model from django.db.models import Count, Q, Avg from django.utils import timezone from django_tenants.utils import get_tenant_model, get_tenant_schema_name from prometheus_client import Gauge, Counter, Histogram, Info, start_http_server from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, REGISTRY import psutil import redis from .middleware import ( DATABASE_QUERIES, CACHE_HITS, CACHE_MISSES, MALAYSIAN_OPERATIONS, TENANT_METRICS, BUSINESS_METRICS, ERROR_EVENTS ) logger = logging.getLogger(__name__) User = get_user_model() TenantModel = get_tenant_model() class DatabaseExporter: """Exporter for database metrics.""" def __init__(self): self.metrics = { 'database_size': Gauge( 'database_size_bytes', 'Database size in bytes', ['database', 'tenant'] ), 'database_connections': Gauge( 'database_connections_current', 'Current database connections', ['state', 'tenant'] ), 'database_transactions': Counter( 'database_transactions_total', 'Database transactions', ['type', 'tenant'] ), 'database_query_time': Histogram( 'database_query_duration_seconds', 'Database query duration', ['query_type', 'tenant'] ), 'database_deadlocks': Counter( 'database_deadlocks_total', 'Database deadlocks', ['tenant'] ), 'database_cache_hit_ratio': Gauge( 'database_cache_hit_ratio', 'Database cache hit ratio', ['tenant'] ), } def collect_metrics(self): """Collect database metrics.""" try: self._collect_database_size() self._collect_connection_metrics() self._collect_transaction_metrics() self._collect_performance_metrics() self._collect_deadlock_metrics() except Exception as e: logger.error(f"Error collecting database metrics: {e}") def _collect_database_size(self): """Collect database size metrics.""" try: with connection.cursor() as cursor: cursor.execute(""" SELECT datname, pg_database_size(datname) as size FROM pg_database WHERE datistemplate = false """) for row in cursor.fetchall(): db_name, size = row self.metrics['database_size'].labels( database=db_name, tenant='all' ).set(size) except Exception as e: logger.error(f"Error collecting database size: {e}") def _collect_connection_metrics(self): """Collect connection metrics.""" try: with connection.cursor() as cursor: # Current connections cursor.execute(""" SELECT state, COUNT(*) FROM pg_stat_activity WHERE pid <> pg_backend_pid() GROUP BY state """) for state, count in cursor.fetchall(): self.metrics['database_connections'].labels( state=state or 'idle', tenant='all' ).set(count) # Max connections cursor.execute("SHOW max_connections") max_connections = cursor.fetchone()[0] self.metrics['database_connections'].labels( state='max', tenant='all' ).set(max_connections) except Exception as e: logger.error(f"Error collecting connection metrics: {e}") def _collect_transaction_metrics(self): """Collect transaction metrics.""" try: with connection.cursor() as cursor: cursor.execute(""" SELECT datname, xact_commit, xact_rollback FROM pg_stat_database """) for db_name, commits, rollbacks in cursor.fetchall(): self.metrics['database_transactions'].labels( type='commit', tenant=db_name )._value._value.set(commits) self.metrics['database_transactions'].labels( type='rollback', tenant=db_name )._value._value.set(rollbacks) except Exception as e: logger.error(f"Error collecting transaction metrics: {e}") def _collect_performance_metrics(self): """Collect performance metrics.""" try: with connection.cursor() as cursor: # Query performance cursor.execute(""" SELECT query, calls, total_time, mean_time, rows FROM pg_stat_statements ORDER BY total_time DESC LIMIT 100 """) for query, calls, total_time, mean_time, rows in cursor.fetchall(): query_type = self._classify_query(query) self.metrics['database_query_time'].labels( query_type=query_type, tenant='all' ).observe(mean_time / 1000) # Convert to seconds # Cache hit ratio cursor.execute(""" SELECT sum(blks_hit) / (sum(blks_hit) + sum(blks_read)) as hit_ratio FROM pg_stat_database """) hit_ratio = cursor.fetchone()[0] if hit_ratio: self.metrics['database_cache_hit_ratio'].labels( tenant='all' ).set(hit_ratio * 100) except Exception as e: logger.error(f"Error collecting performance metrics: {e}") def _collect_deadlock_metrics(self): """Collect deadlock metrics.""" try: with connection.cursor() as cursor: cursor.execute(""" SELECT datname, deadlocks FROM pg_stat_database """) for db_name, deadlocks in cursor.fetchall(): if deadlocks > 0: self.metrics['database_deadlocks'].labels( tenant=db_name )._value._value.set(deadlocks) except Exception as e: logger.error(f"Error collecting deadlock metrics: {e}") def _classify_query(self, query: str) -> str: """Classify SQL query type.""" query_upper = query.upper() if query_upper.startswith('SELECT'): return 'select' elif query_upper.startswith('INSERT'): return 'insert' elif query_upper.startswith('UPDATE'): return 'update' elif query_upper.startswith('DELETE'): return 'delete' elif query_upper.startswith('CREATE'): return 'ddl' elif query_upper.startswith('ALTER'): return 'ddl' elif query_upper.startswith('DROP'): return 'ddl' else: return 'other' class CacheExporter: """Exporter for cache metrics.""" def __init__(self): self.metrics = { 'cache_size': Gauge( 'cache_size_bytes', 'Cache size in bytes', ['cache_type', 'tenant'] ), 'cache_items': Gauge( 'cache_items_total', 'Total items in cache', ['cache_type', 'tenant'] ), 'cache_operations': Counter( 'cache_operations_total', 'Cache operations', ['operation', 'cache_type', 'tenant'] ), 'cache_hit_ratio': Gauge( 'cache_hit_ratio_percent', 'Cache hit ratio percentage', ['cache_type', 'tenant'] ), 'cache_evictions': Counter( 'cache_evictions_total', 'Cache evictions', ['cache_type', 'tenant'] ), 'cache_memory_usage': Gauge( 'cache_memory_usage_bytes', 'Cache memory usage', ['cache_type', 'tenant'] ), } def collect_metrics(self): """Collect cache metrics.""" try: self._collect_redis_metrics() self._collect_django_cache_metrics() except Exception as e: logger.error(f"Error collecting cache metrics: {e}") def _collect_redis_metrics(self): """Collect Redis metrics.""" try: redis_client = redis.Redis.from_url(settings.REDIS_URL) info = redis_client.info() # Memory usage self.metrics['cache_memory_usage'].labels( cache_type='redis', tenant='all' ).set(info['used_memory']) # Key count self.metrics['cache_items'].labels( cache_type='redis', tenant='all' ).set(info['keyspace_hits'] + info['keyspace_misses']) # Hit ratio total = info['keyspace_hits'] + info['keyspace_misses'] if total > 0: hit_ratio = (info['keyspace_hits'] / total) * 100 self.metrics['cache_hit_ratio'].labels( cache_type='redis', tenant='all' ).set(hit_ratio) # Operations self.metrics['cache_operations'].labels( operation='get', cache_type='redis', tenant='all' )._value._value.set(info['keyspace_hits'] + info['keyspace_misses']) except Exception as e: logger.error(f"Error collecting Redis metrics: {e}") def _collect_django_cache_metrics(self): """Collect Django cache metrics.""" try: # Get Django cache stats cache_stats = cache.get_stats() for backend_name, stats in cache_stats.items(): if 'hits' in stats and 'misses' in stats: total = stats['hits'] + stats['misses'] if total > 0: hit_ratio = (stats['hits'] / total) * 100 self.metrics['cache_hit_ratio'].labels( cache_type='django', tenant='all' ).set(hit_ratio) self.metrics['cache_operations'].labels( operation='get', cache_type='django', tenant='all' )._value._value.set(total) except Exception as e: logger.error(f"Error collecting Django cache metrics: {e}") class SystemExporter: """Exporter for system metrics.""" def __init__(self): self.metrics = { 'system_cpu_usage': Gauge( 'system_cpu_usage_percent', 'System CPU usage percentage' ), 'system_memory_usage': Gauge( 'system_memory_usage_bytes', 'System memory usage' ), 'system_memory_usage_percent': Gauge( 'system_memory_usage_percent', 'System memory usage percentage' ), 'system_disk_usage': Gauge( 'system_disk_usage_bytes', 'System disk usage', ['device', 'mountpoint'] ), 'system_disk_usage_percent': Gauge( 'system_disk_usage_percent', 'System disk usage percentage', ['device', 'mountpoint'] ), 'system_network_bytes': Counter( 'system_network_bytes_total', 'System network traffic', ['direction', 'interface'] ), 'system_load_average': Gauge( 'system_load_average', 'System load average', ['period'] ), 'system_uptime': Gauge( 'system_uptime_seconds', 'System uptime in seconds' ), } def collect_metrics(self): """Collect system metrics.""" try: self._collect_cpu_metrics() self._collect_memory_metrics() self._collect_disk_metrics() self._collect_network_metrics() self._collect_load_metrics() except Exception as e: logger.error(f"Error collecting system metrics: {e}") def _collect_cpu_metrics(self): """Collect CPU metrics.""" try: cpu_percent = psutil.cpu_percent(interval=1) self.metrics['system_cpu_usage'].set(cpu_percent) # Per-CPU usage cpu_times = psutil.cpu_times_percent(interval=1) for i, (cpu_id, percent) in enumerate(psutil.cpu_percent(interval=1, percpu=True)): self.metrics['system_cpu_usage'].labels(cpu=f'cpu_{i}').set(percent) except Exception as e: logger.error(f"Error collecting CPU metrics: {e}") def _collect_memory_metrics(self): """Collect memory metrics.""" try: memory = psutil.virtual_memory() self.metrics['system_memory_usage'].set(memory.used) self.metrics['system_memory_usage_percent'].set(memory.percent) # Swap memory swap = psutil.swap_memory() self.metrics['system_memory_usage'].labels(type='swap').set(swap.used) self.metrics['system_memory_usage_percent'].labels(type='swap').set(swap.percent) except Exception as e: logger.error(f"Error collecting memory metrics: {e}") def _collect_disk_metrics(self): """Collect disk metrics.""" try: disk_usage = psutil.disk_usage('/') self.metrics['system_disk_usage'].labels( device='root', mountpoint='/' ).set(disk_usage.used) self.metrics['system_disk_usage_percent'].labels( device='root', mountpoint='/' ).set((disk_usage.used / disk_usage.total) * 100) # Disk I/O disk_io = psutil.disk_io_counters() if disk_io: self.metrics['system_network_bytes'].labels( direction='read', interface='disk' )._value._value.set(disk_io.read_bytes) self.metrics['system_network_bytes'].labels( direction='write', interface='disk' )._value._value.set(disk_io.write_bytes) except Exception as e: logger.error(f"Error collecting disk metrics: {e}") def _collect_network_metrics(self): """Collect network metrics.""" try: net_io = psutil.net_io_counters() if net_io: self.metrics['system_network_bytes'].labels( direction='recv', interface='all' )._value._value.set(net_io.bytes_recv) self.metrics['system_network_bytes'].labels( direction='sent', interface='all' )._value._value.set(net_io.bytes_sent) except Exception as e: logger.error(f"Error collecting network metrics: {e}") def _collect_load_metrics(self): """Collect load average metrics.""" try: load_avg = psutil.getloadavg() self.metrics['system_load_average'].labels(period='1min').set(load_avg[0]) self.metrics['system_load_average'].labels(period='5min').set(load_avg[1]) self.metrics['system_load_average'].labels(period='15min').set(load_avg[2]) # System uptime self.metrics['system_uptime'].set(time.time() - psutil.boot_time()) except Exception as e: logger.error(f"Error collecting load metrics: {e}") class BusinessExporter: """Exporter for business metrics.""" def __init__(self): self.metrics = { 'active_users': Gauge( 'business_active_users', 'Number of active users', ['tenant', 'industry_type'] ), 'user_registrations': Counter( 'business_user_registrations_total', 'User registrations', ['tenant', 'period'] ), 'revenue': Counter( 'business_revenue_myr_total', 'Revenue in Malaysian Ringgit', ['tenant', 'industry_type'] ), 'transactions': Counter( 'business_transactions_total', 'Business transactions', ['status', 'tenant', 'payment_method'] ), 'tenant_resource_usage': Gauge( 'business_tenant_resource_usage_percent', 'Tenant resource usage percentage', ['tenant', 'resource_type'] ), 'malaysian_specific': Counter( 'business_malaysian_operations_total', 'Malaysian-specific operations', ['operation', 'state', 'tenant'] ), } def collect_metrics(self): """Collect business metrics.""" try: self._collect_user_metrics() self._collect_revenue_metrics() self._collect_transaction_metrics() self._collect_tenant_metrics() self._collect_malaysian_metrics() except Exception as e: logger.error(f"Error collecting business metrics: {e}") def _collect_user_metrics(self): """Collect user metrics.""" try: # Active users (last 5 minutes) five_minutes_ago = timezone.now() - timedelta(minutes=5) active_count = User.objects.filter( last_login__gte=five_minutes_ago, is_active=True ).count() self.metrics['active_users'].labels( tenant='all', industry_type='all' ).set(active_count) # User registrations by period today = timezone.now().date() week_ago = today - timedelta(days=7) month_ago = today - timedelta(days=30) registrations_today = User.objects.filter( date_joined__date=today ).count() registrations_week = User.objects.filter( date_joined__date__gte=week_ago ).count() registrations_month = User.objects.filter( date_joined__date__gte=month_ago ).count() self.metrics['user_registrations'].labels( tenant='all', period='today' )._value._value.set(registrations_today) self.metrics['user_registrations'].labels( tenant='all', period='week' )._value._value.set(registrations_week) self.metrics['user_registrations'].labels( tenant='all', period='month' )._value._value.set(registrations_month) except Exception as e: logger.error(f"Error collecting user metrics: {e}") def _collect_revenue_metrics(self): """Collect revenue metrics.""" try: # This would integrate with your payment system # For now, we'll use placeholder values from core.models import Transaction today = timezone.now().date() today_revenue = Transaction.objects.filter( created_at__date=today, status='completed' ).aggregate(total=Sum('amount'))['total'] or 0 self.metrics['revenue'].labels( tenant='all', industry_type='all' )._value._value.set(today_revenue) except Exception as e: logger.error(f"Error collecting revenue metrics: {e}") def _collect_transaction_metrics(self): """Collect transaction metrics.""" try: from core.models import Transaction # Transaction counts by status status_counts = Transaction.objects.values('status').annotate( count=Count('id') ) for item in status_counts: self.metrics['transactions'].labels( status=item['status'], tenant='all', payment_method='all' )._value._value.set(item['count']) except Exception as e: logger.error(f"Error collecting transaction metrics: {e}") def _collect_tenant_metrics(self): """Collect tenant metrics.""" try: tenants = TenantModel.objects.all() for tenant in tenants: # Tenant resource usage (placeholder) self.metrics['tenant_resource_usage'].labels( tenant=tenant.name, resource_type='storage' ).set(50) # Placeholder value # Tenant active users active_users = User.objects.filter( tenant=tenant, is_active=True, last_login__gte=timezone.now() - timedelta(minutes=30) ).count() self.metrics['active_users'].labels( tenant=tenant.name, industry_type=getattr(tenant, 'industry_type', 'general') ).set(active_users) except Exception as e: logger.error(f"Error collecting tenant metrics: {e}") def _collect_malaysian_metrics(self): """Collect Malaysian-specific metrics.""" try: from core.models import MalaysianICValidation, SSTCalculation # IC validations by state ic_validations = MalaysianICValidation.objects.values( 'state' ).annotate(count=Count('id')) for item in ic_validations: self.metrics['malaysian_specific'].labels( operation='ic_validation', state=item['state'], tenant='all' )._value._value.set(item['count']) # SST calculations sst_calculations = SSTCalculation.objects.count() self.metrics['malaysian_specific'].labels( operation='sst_calculation', state='all', tenant='all' )._value._value.set(sst_calculations) except Exception as e: logger.error(f"Error collecting Malaysian metrics: {e}") class MetricsCollector: """Main metrics collector that runs all exporters.""" def __init__(self): self.exporters = { 'database': DatabaseExporter(), 'cache': CacheExporter(), 'system': SystemExporter(), 'business': BusinessExporter(), } self.running = False self.thread = None def start_collection(self, interval: int = 30): """Start metrics collection in background thread.""" if not self.running: self.running = True self.thread = threading.Thread(target=self._collect_loop, args=(interval,)) self.thread.daemon = True self.thread.start() logger.info("Metrics collection started") def stop_collection(self): """Stop metrics collection.""" self.running = False if self.thread: self.thread.join() logger.info("Metrics collection stopped") def _collect_loop(self, interval: int): """Main collection loop.""" while self.running: try: for name, exporter in self.exporters.items(): logger.debug(f"Collecting {name} metrics...") exporter.collect_metrics() time.sleep(interval) except Exception as e: logger.error(f"Error in metrics collection loop: {e}") time.sleep(interval) def collect_once(self): """Collect metrics once (for testing).""" for name, exporter in self.exporters.items(): try: logger.debug(f"Collecting {name} metrics...") exporter.collect_metrics() except Exception as e: logger.error(f"Error collecting {name} metrics: {e}") # Global metrics collector instance metrics_collector = MetricsCollector()