Monitoring and Health Checks #7125
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Monitoring and Health Checks

# Triggers: a recurring cron schedule, plus a manual run where the operator
# picks which environment to probe.
on:
  schedule:
    # Fire on every fifth minute
    - cron: '*/5 * * * *'
  workflow_dispatch:
    inputs:
      environment:
        description: 'Environment to monitor'
        required: true
        default: 'production'
        type: choice
        options: [production, staging]

jobs:
# Health check production
  # Probes the production API (/api/health, /api/health/database,
  # /api/health/redis) and fails the job when any endpoint is unhealthy.
  health-check-production:
    runs-on: ubuntu-latest
    # Runs on every scheduled tick, or on a manual dispatch that selected
    # "production".
    if: github.event_name == 'schedule' || github.event.inputs.environment == 'production'
    steps:
      - name: Check production health
        env:
          # Passed through the environment instead of being expanded into the
          # script text, so the value is never parsed as shell code.
          PRODUCTION_URL: ${{ secrets.PRODUCTION_URL }}
        run: |
          # Basic health check.
          # One request yields both the status code and the total time, so the
          # latency reported below belongs to the same response that was
          # status-checked. `|| true` keeps the runner's default `bash -e` from
          # aborting on a transport error before the failure message prints;
          # the timeouts stop a hung endpoint from stalling the job.
          PROBE=$(curl -s -o /dev/null --connect-timeout 10 --max-time 30 \
            -w "%{http_code} %{time_total}" "$PRODUCTION_URL/api/health" || true)
          HTTP_STATUS="${PROBE%% *}"
          RESPONSE_TIME="${PROBE##* }"
          if [ "$HTTP_STATUS" != "200" ]; then
            echo "❌ Production health check failed (HTTP $HTTP_STATUS)"
            exit 1
          fi

          # Response time check (curl reports seconds; convert to whole ms).
          # awk is used so the step does not depend on bc being installed.
          RESPONSE_TIME_MS=$(awk -v t="${RESPONSE_TIME:-0}" 'BEGIN { printf "%.0f", t * 1000 }')
          if [ "$RESPONSE_TIME_MS" -gt 5000 ]; then
            # Slow responses only warn; they do not fail the job.
            echo "⚠️ Production response time is slow: ${RESPONSE_TIME_MS}ms"
          else
            echo "✅ Production health check passed (${RESPONSE_TIME_MS}ms)"
          fi

          # Database connectivity check (-f: HTTP >= 400 counts as failure)
          curl -fsS --connect-timeout 10 --max-time 30 "$PRODUCTION_URL/api/health/database" || {
            echo "❌ Database connectivity check failed"
            exit 1
          }

          # Redis connectivity check
          curl -fsS --connect-timeout 10 --max-time 30 "$PRODUCTION_URL/api/health/redis" || {
            echo "❌ Redis connectivity check failed"
            exit 1
          }
      - name: Alert on failure
        if: failure()
        env:
          # Ref names are user-controlled text; read them from the environment
          # rather than interpolating them into the script.
          BRANCH: ${{ github.ref_name }}
        run: |
          echo "🚨 Production Health Check Failed!"
          echo "Environment: Production"
          echo "Time: $(date)"
          echo "Branch: $BRANCH"
          echo "Please check the production environment immediately!"
# Health check staging
  # Probes the staging API's /api/health endpoint; only runs on a manual
  # dispatch that selected "staging".
  health-check-staging:
    runs-on: ubuntu-latest
    if: github.event.inputs.environment == 'staging'
    steps:
      - name: Check staging health
        env:
          # Passed through the environment instead of being expanded into the
          # script text, so the value is never parsed as shell code.
          STAGING_URL: ${{ secrets.STAGING_URL }}
        run: |
          # Basic health check.
          # `|| true` keeps the runner's default `bash -e` from aborting on a
          # transport error before the failure message below is printed; the
          # timeouts stop a hung endpoint from stalling the job.
          HTTP_STATUS=$(curl -s -o /dev/null --connect-timeout 10 --max-time 30 \
            -w "%{http_code}" "$STAGING_URL/api/health" || true)
          if [ "$HTTP_STATUS" != "200" ]; then
            echo "❌ Staging health check failed (HTTP $HTTP_STATUS)"
            exit 1
          fi
          echo "✅ Staging health check passed"
# Performance monitoring
  # Scheduled-only job: installs dependencies, runs the optional
  # `test:performance` npm script, and prints the production metrics payload.
  performance-monitoring:
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "npm"
      - name: Install dependencies
        run: npm ci
      - name: Run performance tests
        run: |
          # Run basic performance tests.
          # --if-present makes a missing `test:performance` script a no-op,
          # while a script that exists and fails now fails the step instead of
          # being reported as "not configured".
          npm run --if-present test:performance
      - name: Check application metrics
        env:
          # Passed through the environment instead of being expanded into the
          # script text.
          PRODUCTION_URL: ${{ secrets.PRODUCTION_URL }}
        run: |
          # Get application metrics; fall back to an empty object when the
          # endpoint cannot be reached within the timeout.
          METRICS=$(curl -s --connect-timeout 10 --max-time 30 "$PRODUCTION_URL/api/metrics" || echo "{}")
          # Parse and check key metrics
          echo "Application metrics:"
          echo "$METRICS"
          # Check if metrics are within acceptable ranges
          # Add your specific metric checks here
# Security monitoring
  # Scheduled-only job: runs `npm audit` and posts a Slack alert when high or
  # critical vulnerabilities are reported.
  security-monitoring:
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule'
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Run security audit
        # Human-readable report; fails the job on moderate-or-worse findings.
        run: npm audit --audit-level=moderate
      - name: Check for known vulnerabilities
        # The audit step above fails precisely when vulnerabilities exist, so
        # this step must still run after a failure or the alert never fires.
        if: ${{ !cancelled() }}
        env:
          # Passed through the environment instead of being expanded into the
          # script text.
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        run: |
          # npm audit exits non-zero when it finds matching vulnerabilities;
          # `|| true` keeps `bash -e` from aborting before the results are
          # parsed and the alert is sent.
          npm audit --audit-level=high --json > audit-results.json || true

          # Check if there are high or critical vulnerabilities.
          # Default to 0 when the report is missing or not valid JSON so the
          # numeric comparisons below cannot error out.
          HIGH_VULNS=$(jq -r '.metadata.vulnerabilities.high // 0' audit-results.json 2>/dev/null || true)
          CRITICAL_VULNS=$(jq -r '.metadata.vulnerabilities.critical // 0' audit-results.json 2>/dev/null || true)
          HIGH_VULNS="${HIGH_VULNS:-0}"
          CRITICAL_VULNS="${CRITICAL_VULNS:-0}"

          if [ "$HIGH_VULNS" -gt 0 ] || [ "$CRITICAL_VULNS" -gt 0 ]; then
            echo "⚠️ Security vulnerabilities found:"
            echo "High: $HIGH_VULNS, Critical: $CRITICAL_VULNS"
            # Send alert (best effort: a delivery problem is logged, and the
            # step still fails below because of the findings themselves).
            if [ -n "$SLACK_WEBHOOK_URL" ]; then
              curl -sS --connect-timeout 10 --max-time 30 -X POST \
                -H 'Content-type: application/json' \
                --data "{\"text\":\"⚠️ Security Alert: $HIGH_VULNS high, $CRITICAL_VULNS critical vulnerabilities found in WorkNow\"}" \
                "$SLACK_WEBHOOK_URL" || echo "⚠️ Failed to deliver Slack alert"
            else
              echo "⚠️ SLACK_WEBHOOK_URL is not set - alert not sent"
            fi
            # Keep the job red, as the failing npm audit exit code did before.
            exit 1
          else
            echo "✅ No critical security vulnerabilities found"
          fi
# Resource monitoring
  # Scheduled-only job: SSHes into the production host and prints warnings
  # for high disk, memory and CPU load, then lists the worknow containers.
  # All checks are advisory; the script only fails if SSH itself fails.
  resource-monitoring:
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule'
    steps:
      - name: Check server resources
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.PRODUCTION_HOST }}
          username: ${{ secrets.PRODUCTION_USER }}
          key: ${{ secrets.PRODUCTION_SSH_KEY }}
          script: |
            # Check disk usage (percent used on /).
            # -P forces the one-line-per-filesystem POSIX layout so the
            # second line always carries the use% in field 5.
            DISK_USAGE=$(df -P / | awk 'NR==2 {sub(/%/, "", $5); print $5}')
            # ${VAR:-0} guards the numeric tests against an empty value.
            if [ "${DISK_USAGE:-0}" -gt 80 ]; then
              echo "⚠️ Disk usage is high: ${DISK_USAGE}%"
            fi

            # Check memory usage (used / total from the second line of `free`)
            MEMORY_USAGE=$(free | awk 'NR==2{printf "%.0f", $3*100/$2}')
            if [ "${MEMORY_USAGE:-0}" -gt 80 ]; then
              echo "⚠️ Memory usage is high: ${MEMORY_USAGE}%"
            fi

            # Check CPU load (1-minute load average).
            # The float comparison is done in awk so the remote host needs
            # neither bc nor bash arithmetic.
            CPU_LOAD=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')
            if awk -v load="${CPU_LOAD:-0}" 'BEGIN { exit !(load > 2.0) }'; then
              echo "⚠️ CPU load is high: $CPU_LOAD"
            fi

            # Check Docker container status; say so explicitly when nothing
            # matches instead of printing an empty section.
            docker ps --format "table {{.Names}}\t{{.Status}}" | grep worknow \
              || echo "⚠️ No running worknow containers found"

            echo "✅ Resource monitoring completed"
# Log analysis
  # Scheduled-only job: SSHes into the production host and scans the last
  # hour of `worknow-app` container logs for error lines.
  log-analysis:
    runs-on: ubuntu-latest
    if: github.event_name == 'schedule'
    steps:
      - name: Analyze application logs
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.PRODUCTION_HOST }}
          username: ${{ secrets.PRODUCTION_USER }}
          key: ${{ secrets.PRODUCTION_SSH_KEY }}
          script: |
            # Without this guard a missing container makes `docker logs`
            # print its own error text, which would be counted as log
            # errors and the result reported as "normal".
            if ! docker inspect worknow-app >/dev/null 2>&1; then
              echo "❌ Container worknow-app not found - cannot analyze logs"
              exit 1
            fi

            # Fetch the last hour of logs once so every check below looks at
            # the same snapshot.
            LOG_FILE=$(mktemp)
            trap 'rm -f "$LOG_FILE"' EXIT
            docker logs worknow-app --since=1h > "$LOG_FILE" 2>&1

            # Check for error patterns in logs (case-insensitive line count;
            # grep -c prints 0 but exits 1 when nothing matches).
            ERROR_COUNT=$(grep -ci error "$LOG_FILE" || true)
            if [ "${ERROR_COUNT:-0}" -gt 10 ]; then
              echo "⚠️ High error count in last hour: $ERROR_COUNT"
              # Get recent errors
              echo "Recent errors:"
              grep -i error "$LOG_FILE" | tail -5
            else
              echo "✅ Error count is normal: $ERROR_COUNT"
            fi

            # Check for specific error patterns (case-sensitive, as before)
            grep -E "(timeout|connection refused|database error)" "$LOG_FILE" || echo "No critical errors found"
# Notify monitoring summary
  # Scheduled-only job that always runs after the monitoring jobs and prints
  # each job's result plus an overall verdict.
  notify-summary:
    runs-on: ubuntu-latest
    needs:
      - health-check-production
      - performance-monitoring
      - security-monitoring
      - resource-monitoring
      - log-analysis
    # always(): the summary must still run when an upstream job failed.
    if: always() && github.event_name == 'schedule'
    steps:
      - name: Send monitoring summary
        env:
          HEALTH_RESULT: ${{ needs.health-check-production.result }}
          PERFORMANCE_RESULT: ${{ needs.performance-monitoring.result }}
          SECURITY_RESULT: ${{ needs.security-monitoring.result }}
          RESOURCES_RESULT: ${{ needs.resource-monitoring.result }}
          LOGS_RESULT: ${{ needs.log-analysis.result }}
          # The verdict comes from the upstream results. `job.status` would
          # describe this summary job itself, which is still succeeding while
          # the step runs, so it could never report a problem.
          ISSUES_DETECTED: ${{ contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') }}
        run: |
          echo "📊 WorkNow Monitoring Summary"
          echo "Health Check: $HEALTH_RESULT"
          echo "Performance: $PERFORMANCE_RESULT"
          echo "Security: $SECURITY_RESULT"
          echo "Resources: $RESOURCES_RESULT"
          echo "Logs: $LOGS_RESULT"
          echo "Time: $(date)"
          if [ "$ISSUES_DETECTED" = "true" ]; then
            echo "⚠️ Some issues detected - check individual jobs"
          else
            echo "✅ All systems operational"
          fi