"""
|
|
Prometheus exporters for various system and application metrics.
|
|
"""
|
|
|
|
import time
|
|
import logging
|
|
import threading
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime, timedelta
|
|
from django.db import connection, connections
|
|
from django.core.cache import cache
|
|
from django.conf import settings
|
|
from django.contrib.auth import get_user_model
|
|
from django.db.models import Count, Q, Avg
|
|
from django.utils import timezone
|
|
from django_tenants.utils import get_tenant_model, get_tenant_schema_name
|
|
from prometheus_client import Gauge, Counter, Histogram, Info, start_http_server
|
|
from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, REGISTRY
|
|
import psutil
|
|
import redis
|
|
|
|
from .middleware import (
|
|
DATABASE_QUERIES, CACHE_HITS, CACHE_MISSES, MALAYSIAN_OPERATIONS,
|
|
TENANT_METRICS, BUSINESS_METRICS, ERROR_EVENTS
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
User = get_user_model()
|
|
TenantModel = get_tenant_model()
|
|
|
|
class DatabaseExporter:
    """Exporter for database metrics."""

    def __init__(self):
        self.metrics = {
            'database_size': Gauge(
                'database_size_bytes',
                'Database size in bytes',
                ['database', 'tenant']
            ),
            'database_connections': Gauge(
                'database_connections_current',
                'Current database connections',
                ['state', 'tenant']
            ),
            'database_transactions': Counter(
                'database_transactions_total',
                'Database transactions',
                ['type', 'tenant']
            ),
            'database_query_time': Histogram(
                'database_query_duration_seconds',
                'Database query duration',
                ['query_type', 'tenant']
            ),
            'database_deadlocks': Counter(
                'database_deadlocks_total',
                'Database deadlocks',
                ['tenant']
            ),
            'database_cache_hit_ratio': Gauge(
                'database_cache_hit_ratio',
                'Database cache hit ratio',
                ['tenant']
            ),
        }

    def collect_metrics(self):
        """Collect database metrics."""
        try:
            self._collect_database_size()
            self._collect_connection_metrics()
            self._collect_transaction_metrics()
            self._collect_performance_metrics()
            self._collect_deadlock_metrics()
        except Exception as e:
            logger.error(f"Error collecting database metrics: {e}")

    def _collect_database_size(self):
        """Collect database size metrics."""
        try:
            with connection.cursor() as cursor:
                cursor.execute("""
                    SELECT datname, pg_database_size(datname) AS size
                    FROM pg_database
                    WHERE datistemplate = false
                """)
                for db_name, size in cursor.fetchall():
                    self.metrics['database_size'].labels(
                        database=db_name,
                        tenant='all'
                    ).set(size)
        except Exception as e:
            logger.error(f"Error collecting database size: {e}")

    def _collect_connection_metrics(self):
        """Collect connection metrics."""
        try:
            with connection.cursor() as cursor:
                # Current connections by state
                cursor.execute("""
                    SELECT state, COUNT(*)
                    FROM pg_stat_activity
                    WHERE pid <> pg_backend_pid()
                    GROUP BY state
                """)
                for state, count in cursor.fetchall():
                    self.metrics['database_connections'].labels(
                        state=state or 'idle',
                        tenant='all'
                    ).set(count)

                # Configured connection limit
                cursor.execute("SHOW max_connections")
                max_connections = cursor.fetchone()[0]
                self.metrics['database_connections'].labels(
                    state='max',
                    tenant='all'
                ).set(max_connections)
        except Exception as e:
            logger.error(f"Error collecting connection metrics: {e}")

    def _collect_transaction_metrics(self):
        """Collect transaction metrics."""
        try:
            with connection.cursor() as cursor:
                cursor.execute("""
                    SELECT datname, xact_commit, xact_rollback
                    FROM pg_stat_database
                """)
                for db_name, commits, rollbacks in cursor.fetchall():
                    # PostgreSQL already accumulates these totals, so the
                    # counters are synchronised through prometheus_client's
                    # private `_value` handle instead of being incremented.
                    self.metrics['database_transactions'].labels(
                        type='commit',
                        tenant=db_name
                    )._value.set(commits)
                    self.metrics['database_transactions'].labels(
                        type='rollback',
                        tenant=db_name
                    )._value.set(rollbacks)
        except Exception as e:
            logger.error(f"Error collecting transaction metrics: {e}")

    def _collect_performance_metrics(self):
        """Collect performance metrics."""
        try:
            with connection.cursor() as cursor:
                # Query performance. Requires the pg_stat_statements extension;
                # note that PostgreSQL 13+ renamed these columns to
                # total_exec_time / mean_exec_time.
                cursor.execute("""
                    SELECT query, calls, total_time, mean_time, rows
                    FROM pg_stat_statements
                    ORDER BY total_time DESC
                    LIMIT 100
                """)
                for query, calls, total_time, mean_time, rows in cursor.fetchall():
                    query_type = self._classify_query(query)
                    self.metrics['database_query_time'].labels(
                        query_type=query_type,
                        tenant='all'
                    ).observe(mean_time / 1000)  # pg_stat_statements reports milliseconds

                # Cache hit ratio; NULLIF avoids division by zero on idle clusters.
                cursor.execute("""
                    SELECT sum(blks_hit)::float
                           / NULLIF(sum(blks_hit) + sum(blks_read), 0) AS hit_ratio
                    FROM pg_stat_database
                """)
                hit_ratio = cursor.fetchone()[0]
                if hit_ratio is not None:
                    self.metrics['database_cache_hit_ratio'].labels(
                        tenant='all'
                    ).set(hit_ratio * 100)
        except Exception as e:
            logger.error(f"Error collecting performance metrics: {e}")

    def _collect_deadlock_metrics(self):
        """Collect deadlock metrics."""
        try:
            with connection.cursor() as cursor:
                cursor.execute("""
                    SELECT datname, deadlocks
                    FROM pg_stat_database
                """)
                for db_name, deadlocks in cursor.fetchall():
                    if deadlocks > 0:
                        self.metrics['database_deadlocks'].labels(
                            tenant=db_name
                        )._value.set(deadlocks)
        except Exception as e:
            logger.error(f"Error collecting deadlock metrics: {e}")

    def _classify_query(self, query: str) -> str:
        """Classify an SQL query by its leading keyword."""
        query_upper = query.lstrip().upper()
        if query_upper.startswith('SELECT'):
            return 'select'
        elif query_upper.startswith('INSERT'):
            return 'insert'
        elif query_upper.startswith('UPDATE'):
            return 'update'
        elif query_upper.startswith('DELETE'):
            return 'delete'
        elif query_upper.startswith(('CREATE', 'ALTER', 'DROP')):
            return 'ddl'
        else:
            return 'other'


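# A cleaner alternative to the `._value.set()` mirroring used above would be a
# custom collector that re-reads PostgreSQL's cumulative totals on every
# scrape, which is what the `prometheus_client.core` imports support. A
# minimal sketch, not wired in: registering it alongside the eagerly created
# Counter of the same name would clash, so one of the two has to go.
class PgStatTransactionCollector:
    """Yield pg_stat_database commit/rollback totals at scrape time."""

    def collect(self):
        family = CounterMetricFamily(
            'database_transactions',
            'Database transactions',
            labels=['type', 'tenant'],
        )
        with connection.cursor() as cursor:
            cursor.execute(
                "SELECT datname, xact_commit, xact_rollback FROM pg_stat_database"
            )
            for db_name, commits, rollbacks in cursor.fetchall():
                family.add_metric(['commit', db_name], commits)
                family.add_metric(['rollback', db_name], rollbacks)
        yield family

# Enabling it would be: REGISTRY.register(PgStatTransactionCollector())

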
class CacheExporter:
    """Exporter for cache metrics."""

    def __init__(self):
        self.metrics = {
            'cache_size': Gauge(
                'cache_size_bytes',
                'Cache size in bytes',
                ['cache_type', 'tenant']
            ),
            'cache_items': Gauge(
                'cache_items_total',
                'Total items in cache',
                ['cache_type', 'tenant']
            ),
            'cache_operations': Counter(
                'cache_operations_total',
                'Cache operations',
                ['operation', 'cache_type', 'tenant']
            ),
            'cache_hit_ratio': Gauge(
                'cache_hit_ratio_percent',
                'Cache hit ratio percentage',
                ['cache_type', 'tenant']
            ),
            'cache_evictions': Counter(
                'cache_evictions_total',
                'Cache evictions',
                ['cache_type', 'tenant']
            ),
            'cache_memory_usage': Gauge(
                'cache_memory_usage_bytes',
                'Cache memory usage',
                ['cache_type', 'tenant']
            ),
        }

    def collect_metrics(self):
        """Collect cache metrics."""
        try:
            self._collect_redis_metrics()
            self._collect_django_cache_metrics()
        except Exception as e:
            logger.error(f"Error collecting cache metrics: {e}")

    def _collect_redis_metrics(self):
        """Collect Redis metrics."""
        try:
            redis_client = redis.Redis.from_url(settings.REDIS_URL)
            info = redis_client.info()

            # Memory usage
            self.metrics['cache_memory_usage'].labels(
                cache_type='redis',
                tenant='all'
            ).set(info['used_memory'])

            # Key count (keys in the configured database)
            self.metrics['cache_items'].labels(
                cache_type='redis',
                tenant='all'
            ).set(redis_client.dbsize())

            # Hit ratio
            hits = info.get('keyspace_hits', 0)
            misses = info.get('keyspace_misses', 0)
            total = hits + misses
            if total > 0:
                self.metrics['cache_hit_ratio'].labels(
                    cache_type='redis',
                    tenant='all'
                ).set((hits / total) * 100)

            # Lookups (cumulative on the server, mirrored via the private API)
            self.metrics['cache_operations'].labels(
                operation='get',
                cache_type='redis',
                tenant='all'
            )._value.set(total)

        except Exception as e:
            logger.error(f"Error collecting Redis metrics: {e}")

    def _collect_django_cache_metrics(self):
        """Collect Django cache metrics."""
        try:
            # get_stats() only exists on the memcached backends and returns a
            # list of (server, stats) pairs; other backends are skipped.
            if not hasattr(cache, 'get_stats'):
                return

            for server, stats in cache.get_stats():
                hits = int(stats.get('get_hits', 0))
                misses = int(stats.get('get_misses', 0))
                total = hits + misses
                if total > 0:
                    self.metrics['cache_hit_ratio'].labels(
                        cache_type='django',
                        tenant='all'
                    ).set((hits / total) * 100)

                    self.metrics['cache_operations'].labels(
                        operation='get',
                        cache_type='django',
                        tenant='all'
                    )._value.set(total)

        except Exception as e:
            logger.error(f"Error collecting Django cache metrics: {e}")


class SystemExporter:
    """Exporter for system metrics."""

    def __init__(self):
        self.metrics = {
            'system_cpu_usage': Gauge(
                'system_cpu_usage_percent',
                'System CPU usage percentage',
                ['cpu']
            ),
            'system_memory_usage': Gauge(
                'system_memory_usage_bytes',
                'System memory usage',
                ['type']
            ),
            'system_memory_usage_percent': Gauge(
                'system_memory_usage_percent',
                'System memory usage percentage',
                ['type']
            ),
            'system_disk_usage': Gauge(
                'system_disk_usage_bytes',
                'System disk usage',
                ['device', 'mountpoint']
            ),
            'system_disk_usage_percent': Gauge(
                'system_disk_usage_percent',
                'System disk usage percentage',
                ['device', 'mountpoint']
            ),
            'system_disk_io_bytes': Counter(
                'system_disk_io_bytes_total',
                'System disk I/O',
                ['direction']
            ),
            'system_network_bytes': Counter(
                'system_network_bytes_total',
                'System network traffic',
                ['direction', 'interface']
            ),
            'system_load_average': Gauge(
                'system_load_average',
                'System load average',
                ['period']
            ),
            'system_uptime': Gauge(
                'system_uptime_seconds',
                'System uptime in seconds'
            ),
        }

    def collect_metrics(self):
        """Collect system metrics."""
        try:
            self._collect_cpu_metrics()
            self._collect_memory_metrics()
            self._collect_disk_metrics()
            self._collect_network_metrics()
            self._collect_load_metrics()
        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    def _collect_cpu_metrics(self):
        """Collect CPU metrics."""
        try:
            # One blocking sample yields a list of per-core percentages;
            # cpu_percent(percpu=True) returns floats, not (id, value) pairs.
            per_cpu = psutil.cpu_percent(interval=1, percpu=True)
            self.metrics['system_cpu_usage'].labels(cpu='all').set(
                sum(per_cpu) / len(per_cpu)
            )
            for i, percent in enumerate(per_cpu):
                self.metrics['system_cpu_usage'].labels(cpu=f'cpu_{i}').set(percent)

        except Exception as e:
            logger.error(f"Error collecting CPU metrics: {e}")

    def _collect_memory_metrics(self):
        """Collect memory metrics."""
        try:
            memory = psutil.virtual_memory()
            self.metrics['system_memory_usage'].labels(type='virtual').set(memory.used)
            self.metrics['system_memory_usage_percent'].labels(type='virtual').set(memory.percent)

            # Swap memory
            swap = psutil.swap_memory()
            self.metrics['system_memory_usage'].labels(type='swap').set(swap.used)
            self.metrics['system_memory_usage_percent'].labels(type='swap').set(swap.percent)

        except Exception as e:
            logger.error(f"Error collecting memory metrics: {e}")

    def _collect_disk_metrics(self):
        """Collect disk metrics."""
        try:
            disk_usage = psutil.disk_usage('/')
            self.metrics['system_disk_usage'].labels(
                device='root',
                mountpoint='/'
            ).set(disk_usage.used)
            self.metrics['system_disk_usage_percent'].labels(
                device='root',
                mountpoint='/'
            ).set((disk_usage.used / disk_usage.total) * 100)

            # Disk I/O (cumulative kernel counters, mirrored via the private API)
            disk_io = psutil.disk_io_counters()
            if disk_io:
                self.metrics['system_disk_io_bytes'].labels(
                    direction='read'
                )._value.set(disk_io.read_bytes)
                self.metrics['system_disk_io_bytes'].labels(
                    direction='write'
                )._value.set(disk_io.write_bytes)

        except Exception as e:
            logger.error(f"Error collecting disk metrics: {e}")

    def _collect_network_metrics(self):
        """Collect network metrics."""
        try:
            net_io = psutil.net_io_counters()
            if net_io:
                self.metrics['system_network_bytes'].labels(
                    direction='recv',
                    interface='all'
                )._value.set(net_io.bytes_recv)
                self.metrics['system_network_bytes'].labels(
                    direction='sent',
                    interface='all'
                )._value.set(net_io.bytes_sent)

        except Exception as e:
            logger.error(f"Error collecting network metrics: {e}")

    def _collect_load_metrics(self):
        """Collect load average metrics."""
        try:
            load_avg = psutil.getloadavg()
            self.metrics['system_load_average'].labels(period='1min').set(load_avg[0])
            self.metrics['system_load_average'].labels(period='5min').set(load_avg[1])
            self.metrics['system_load_average'].labels(period='15min').set(load_avg[2])

            # System uptime
            self.metrics['system_uptime'].set(time.time() - psutil.boot_time())

        except Exception as e:
            logger.error(f"Error collecting load metrics: {e}")


class BusinessExporter:
    """Exporter for business metrics."""

    def __init__(self):
        self.metrics = {
            'active_users': Gauge(
                'business_active_users',
                'Number of active users',
                ['tenant', 'industry_type']
            ),
            'user_registrations': Counter(
                'business_user_registrations_total',
                'User registrations',
                ['tenant', 'period']
            ),
            'revenue': Counter(
                'business_revenue_myr_total',
                'Revenue in Malaysian Ringgit',
                ['tenant', 'industry_type']
            ),
            'transactions': Counter(
                'business_transactions_total',
                'Business transactions',
                ['status', 'tenant', 'payment_method']
            ),
            'tenant_resource_usage': Gauge(
                'business_tenant_resource_usage_percent',
                'Tenant resource usage percentage',
                ['tenant', 'resource_type']
            ),
            'malaysian_specific': Counter(
                'business_malaysian_operations_total',
                'Malaysian-specific operations',
                ['operation', 'state', 'tenant']
            ),
        }

    def collect_metrics(self):
        """Collect business metrics."""
        try:
            self._collect_user_metrics()
            self._collect_revenue_metrics()
            self._collect_transaction_metrics()
            self._collect_tenant_metrics()
            self._collect_malaysian_metrics()
        except Exception as e:
            logger.error(f"Error collecting business metrics: {e}")

    def _collect_user_metrics(self):
        """Collect user metrics."""
        try:
            # Active users (logged in within the last 5 minutes)
            five_minutes_ago = timezone.now() - timedelta(minutes=5)
            active_count = User.objects.filter(
                last_login__gte=five_minutes_ago,
                is_active=True
            ).count()

            self.metrics['active_users'].labels(
                tenant='all',
                industry_type='all'
            ).set(active_count)

            # User registrations by period
            today = timezone.now().date()
            week_ago = today - timedelta(days=7)
            month_ago = today - timedelta(days=30)

            registrations_today = User.objects.filter(
                date_joined__date=today
            ).count()
            registrations_week = User.objects.filter(
                date_joined__date__gte=week_ago
            ).count()
            registrations_month = User.objects.filter(
                date_joined__date__gte=month_ago
            ).count()

            # These are rolling snapshots recomputed from the database, so they
            # are written through the private `_value` handle rather than
            # incremented.
            self.metrics['user_registrations'].labels(
                tenant='all',
                period='today'
            )._value.set(registrations_today)
            self.metrics['user_registrations'].labels(
                tenant='all',
                period='week'
            )._value.set(registrations_week)
            self.metrics['user_registrations'].labels(
                tenant='all',
                period='month'
            )._value.set(registrations_month)

        except Exception as e:
            logger.error(f"Error collecting user metrics: {e}")

    def _collect_revenue_metrics(self):
        """Collect revenue metrics."""
        try:
            from core.models import Transaction

            # Aggregate completed transactions from the payment models.
            today = timezone.now().date()
            today_revenue = Transaction.objects.filter(
                created_at__date=today,
                status='completed'
            ).aggregate(total=Sum('amount'))['total'] or 0

            self.metrics['revenue'].labels(
                tenant='all',
                industry_type='all'
            )._value.set(today_revenue)

        except Exception as e:
            logger.error(f"Error collecting revenue metrics: {e}")

    def _collect_transaction_metrics(self):
        """Collect transaction metrics."""
        try:
            from core.models import Transaction

            # Transaction counts by status
            status_counts = Transaction.objects.values('status').annotate(
                count=Count('id')
            )

            for item in status_counts:
                self.metrics['transactions'].labels(
                    status=item['status'],
                    tenant='all',
                    payment_method='all'
                )._value.set(item['count'])

        except Exception as e:
            logger.error(f"Error collecting transaction metrics: {e}")

    def _collect_tenant_metrics(self):
        """Collect tenant metrics."""
        try:
            for tenant in TenantModel.objects.all():
                # Tenant resource usage (placeholder until real accounting exists)
                self.metrics['tenant_resource_usage'].labels(
                    tenant=tenant.name,
                    resource_type='storage'
                ).set(50)  # Placeholder value

                # Tenant active users (last 30 minutes)
                active_users = User.objects.filter(
                    tenant=tenant,
                    is_active=True,
                    last_login__gte=timezone.now() - timedelta(minutes=30)
                ).count()

                self.metrics['active_users'].labels(
                    tenant=tenant.name,
                    industry_type=getattr(tenant, 'industry_type', 'general')
                ).set(active_users)

        except Exception as e:
            logger.error(f"Error collecting tenant metrics: {e}")

    def _collect_malaysian_metrics(self):
        """Collect Malaysian-specific metrics."""
        try:
            from core.models import MalaysianICValidation, SSTCalculation

            # IC validations by state
            ic_validations = MalaysianICValidation.objects.values(
                'state'
            ).annotate(count=Count('id'))

            for item in ic_validations:
                self.metrics['malaysian_specific'].labels(
                    operation='ic_validation',
                    state=item['state'],
                    tenant='all'
                )._value.set(item['count'])

            # SST calculations
            sst_calculations = SSTCalculation.objects.count()
            self.metrics['malaysian_specific'].labels(
                operation='sst_calculation',
                state='all',
                tenant='all'
            )._value.set(sst_calculations)

        except Exception as e:
            logger.error(f"Error collecting Malaysian metrics: {e}")


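# Most tenant labels above default to 'all'. A per-tenant pass is sketched
# below, assuming django_tenants' `schema_context` helper and the standard
# `schema_name` field on the tenant model; nothing in this module calls it
# yet.
def collect_for_each_tenant(collect):
    """Run `collect(tenant)` inside each tenant's schema."""
    from django_tenants.utils import schema_context

    for tenant in TenantModel.objects.all():
        with schema_context(tenant.schema_name):
            collect(tenant)

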
class MetricsCollector:
    """Main metrics collector that runs all exporters."""

    def __init__(self):
        self.exporters = {
            'database': DatabaseExporter(),
            'cache': CacheExporter(),
            'system': SystemExporter(),
            'business': BusinessExporter(),
        }
        self.running = False
        self.thread = None

    def start_collection(self, interval: int = 30):
        """Start metrics collection in background thread."""
        if not self.running:
            self.running = True
            self.thread = threading.Thread(target=self._collect_loop, args=(interval,))
            self.thread.daemon = True
            self.thread.start()
            logger.info("Metrics collection started")

    def stop_collection(self):
        """Stop metrics collection."""
        self.running = False
        if self.thread:
            self.thread.join()
        logger.info("Metrics collection stopped")

    def _collect_loop(self, interval: int):
        """Main collection loop."""
        while self.running:
            try:
                for name, exporter in self.exporters.items():
                    logger.debug(f"Collecting {name} metrics...")
                    exporter.collect_metrics()

                time.sleep(interval)
            except Exception as e:
                logger.error(f"Error in metrics collection loop: {e}")
                time.sleep(interval)

    def collect_once(self):
        """Collect metrics once (for testing)."""
        for name, exporter in self.exporters.items():
            try:
                logger.debug(f"Collecting {name} metrics...")
                exporter.collect_metrics()
            except Exception as e:
                logger.error(f"Error collecting {name} metrics: {e}")


# Global metrics collector instance
metrics_collector = MetricsCollector()
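

# Example wiring (a minimal sketch; the port and interval defaults below are
# assumptions, not values fixed anywhere in this module). Call this from an
# AppConfig.ready() hook or a management command to expose the default
# registry over HTTP and start background collection.
def run_exporter(port: int = 8001, interval: int = 30):
    """Serve /metrics on `port` and collect every `interval` seconds."""
    start_http_server(port)  # serves the global prometheus_client registry
    metrics_collector.start_collection(interval=interval)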