---
# System Monitoring workflow.
#
# Runs a set of independent monitoring jobs (health, performance, database,
# cache, logs, resources, uptime, backups, security), aggregates their report
# artifacts into a dashboard, posts a Slack summary, and opens a GitHub issue
# plus an emergency Slack notification when any job in the chain fails.
name: System Monitoring

on:
  schedule:
    - cron: '0 */6 * * *'  # Every 6 hours
  workflow_dispatch:
  push:
    branches: [main, develop]

# Least-privilege default for every job; the alerting job widens this itself.
permissions:
  contents: read

jobs:
  health-checks:
    name: Health Checks
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # `curl -f` exits non-zero on HTTP >= 400, so the first failing
      # endpoint fails the step.
      - name: Run production health checks
        run: |
          # API Health
          curl -f https://api.malaysian-sme-platform.com/health/ || exit 1
          curl -f https://api.malaysian-sme-platform.com/api/health/ || exit 1

          # Application Health
          curl -f https://app.malaysian-sme-platform.com/ || exit 1

          # Database Health
          curl -f https://api.malaysian-sme-platform.com/api/health/database/ || exit 1

          # Cache Health
          curl -f https://api.malaysian-sme-platform.com/api/health/cache/ || exit 1

      - name: Run staging health checks
        run: |
          curl -f https://staging.malaysian-sme-platform.com/health/ || exit 1
          curl -f https://staging.malaysian-sme-platform.com/api/health/ || exit 1

  performance-monitoring:
    name: Performance Monitoring
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Installs the k6 CLI on the runner so the next step's `k6 run` can
      # find it. NOTE(review): the previous grafana/k6-action step executed
      # the script inside its own container instead of installing the
      # binary -- confirm no other tooling relied on that extra run.
      - name: Set up k6
        uses: grafana/setup-k6-action@v1

      - name: Run performance monitoring
        run: |
          cd tests/performance
          k6 run monitoring.js \
            --env PROD_URL=https://api.malaysian-sme-platform.com \
            --env STAGING_URL=https://staging.malaysian-sme-platform.com

      # Upload even when the run above failed so the dashboard job still
      # receives whatever results were written.
      - name: Upload performance results
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: performance-monitoring-results
          path: tests/performance/results/

  database-monitoring:
    name: Database Monitoring
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted: a bare 3.10 is a YAML float and would be read as 3.1.
          python-version: '3.10'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install psycopg2-binary pandas matplotlib

      - name: Run database health checks
        env:
          DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
        run: |
          python scripts/database-health-check.py

      - name: Generate database metrics report
        env:
          DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
        run: |
          python scripts/database-metrics.py --output database-metrics.json

      - name: Upload database reports
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: database-monitoring-reports
          path: |
            database-metrics.json
            database-health-report.json

  cache-monitoring:
    name: Cache Monitoring
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted: a bare 3.10 is a YAML float and would be read as 3.1.
          python-version: '3.10'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install redis pandas

      - name: Run Redis health checks
        env:
          REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
        run: |
          python scripts/redis-health-check.py

      - name: Generate cache metrics report
        env:
          REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
        run: |
          python scripts/cache-metrics.py --output cache-metrics.json

      - name: Upload cache reports
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: cache-monitoring-reports
          path: |
            cache-metrics.json
            redis-health-report.json

  log-monitoring:
    name: Log Monitoring
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Analyze application logs
        env:
          LOGS_ACCESS_KEY: ${{ secrets.LOGS_ACCESS_KEY }}
        run: |
          python scripts/log-analysis.py \
            --hours 6 \
            --error-threshold 10 \
            --warning-threshold 50 \
            --output log-analysis-report.json

      # The webhook is handed over through the environment and quoted in the
      # shell, rather than being spliced into the command text by the
      # expression engine.
      - name: Check for critical errors
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        run: |
          python scripts/critical-error-check.py \
            --hours 1 \
            --notification-webhook "$SLACK_WEBHOOK"

      - name: Upload log reports
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: log-monitoring-reports
          path: |
            log-analysis-report.json
            error-summary.json

  resource-monitoring:
    name: Resource Monitoring
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Monitor system resources
        env:
          MONITORING_API_KEY: ${{ secrets.MONITORING_API_KEY }}
        run: |
          python scripts/resource-monitoring.py \
            --output resource-metrics.json

      - name: Check resource thresholds
        run: |
          python scripts/resource-threshold-check.py \
            --cpu-threshold 80 \
            --memory-threshold 85 \
            --disk-threshold 90 \
            --output threshold-report.json

      - name: Upload resource reports
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: resource-monitoring-reports
          path: |
            resource-metrics.json
            threshold-report.json

  uptime-monitoring:
    name: Uptime Monitoring
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Check service uptime
        run: |
          python scripts/uptime-check.py \
            --services api,app,admin,static \
            --timeout 30 \
            --output uptime-report.json

      - name: Verify SSL certificates
        run: |
          python scripts/ssl-check.py \
            --domains api.malaysian-sme-platform.com,app.malaysian-sme-platform.com \
            --output ssl-report.json

      - name: Upload uptime reports
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: uptime-monitoring-reports
          path: |
            uptime-report.json
            ssl-report.json

  backup-monitoring:
    name: Backup Monitoring
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Check backup status
        env:
          BACKUP_ACCESS_KEY: ${{ secrets.BACKUP_ACCESS_KEY }}
        run: |
          python scripts/backup-check.py \
            --hours 24 \
            --output backup-report.json

      - name: Verify backup integrity
        run: |
          python scripts/backup-integrity.py \
            --verify-latest 3 \
            --output integrity-report.json

      - name: Upload backup reports
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: backup-monitoring-reports
          path: |
            backup-report.json
            integrity-report.json

  security-monitoring:
    name: Security Monitoring
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Check for security events
        env:
          SECURITY_API_KEY: ${{ secrets.SECURITY_API_KEY }}
        run: |
          python scripts/security-monitoring.py \
            --hours 6 \
            --output security-events.json

      - name: Analyze authentication patterns
        env:
          AUTH_LOGS_ACCESS_KEY: ${{ secrets.AUTH_LOGS_ACCESS_KEY }}
        run: |
          python scripts/auth-pattern-analysis.py \
            --hours 24 \
            --output auth-patterns.json

      - name: Upload security reports
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: security-monitoring-reports
          path: |
            security-events.json
            auth-patterns.json

  # Aggregates every job's artifacts; `always()` keeps it running when one of
  # the monitoring jobs above has failed.
  monitoring-dashboard:
    name: Monitoring Dashboard
    runs-on: ubuntu-latest
    needs:
      - health-checks
      - performance-monitoring
      - database-monitoring
      - cache-monitoring
      - log-monitoring
      - resource-monitoring
      - uptime-monitoring
      - backup-monitoring
      - security-monitoring
    if: always()
    steps:
      # Required so scripts/generate-monitoring-dashboard.py exists in the
      # workspace. Must run before the download: checkout cleans the
      # workspace.
      - name: Checkout code
        uses: actions/checkout@v4

      # With no `name` input, each artifact is extracted into a directory
      # named after the artifact.
      - name: Download all reports
        uses: actions/download-artifact@v4

      - name: Generate monitoring dashboard
        run: |
          python scripts/generate-monitoring-dashboard.py

      - name: Upload monitoring dashboard
        uses: actions/upload-artifact@v4
        with:
          name: monitoring-dashboard
          path: monitoring-dashboard.html

      # NOTE(review): action-slack v3 appears to read the webhook from the
      # SLACK_WEBHOOK_URL env var; `webhook_url` is kept as in the original
      # but may be an unrecognised input -- confirm and drop if so.
      - name: Send monitoring summary to Slack
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          channel: '#monitoring'
          webhook_url: ${{ secrets.SLACK_WEBHOOK }}
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}

  # Runs only when a job earlier in the dependency chain has failed.
  alerting:
    name: Alerting
    runs-on: ubuntu-latest
    needs: monitoring-dashboard
    if: failure()
    # Creating the alert issue needs write access to issues.
    permissions:
      issues: write
    steps:
      - name: Create alert issue
        uses: actions/github-script@v6
        with:
          script: |
            // All report links point at this workflow run's summary page.
            const runUrl =
              'https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}';

            const reportNames = [
              'Health Checks',
              'Performance Monitoring',
              'Database Monitoring',
              'Cache Monitoring',
              'Log Monitoring',
              'Resource Monitoring',
              'Uptime Monitoring',
              'Backup Monitoring',
              'Security Monitoring',
            ];

            // Joined from an array so YAML/JS indentation never leaks into
            // the rendered Markdown. The timestamp is taken at issue-creation
            // time because the `github` context has no run start time.
            const body = [
              'Monitoring checks failed for run #${{ github.run_number }}.',
              '',
              `**Time:** ${{ github.event_name }} at ${new Date().toISOString()}`,
              '**Repository:** ${{ github.repository }}',
              '',
              'Please review the monitoring reports and investigate the issues.',
              '',
              '📋 **Monitoring Reports:**',
              ...reportNames.map((name) => `- [${name}](${runUrl})`),
              '',
              '🎯 **Immediate Actions:**',
              '1. Review failed monitoring checks',
              '2. Investigate service availability',
              '3. Check system resources',
              '4. Verify backup integrity',
              '5. Address any security events',
              '',
              'This issue was automatically created by the monitoring system.',
            ].join('\n');

            // Awaited so an API error fails this step instead of surfacing
            // as an unhandled rejection.
            await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: '🚨 Monitoring Alert - ${{ github.run_number }}',
              body,
              labels: ['monitoring', 'alert', 'priority-critical'],
            });

      - name: Send emergency notification
        uses: 8398a7/action-slack@v3
        with:
          status: failure
          channel: '#emergency'
          webhook_url: ${{ secrets.EMERGENCY_SLACK_WEBHOOK }}
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.EMERGENCY_SLACK_WEBHOOK }}