Skip to content

Monitoring and Health Checks #7125

Monitoring and Health Checks

Monitoring and Health Checks #7125

Workflow file for this run

# Scheduled and on-demand monitoring for the WorkNow environments.
name: Monitoring and Health Checks

on:
  schedule:
    # Run health checks every 5 minutes
    - cron: "*/5 * * * *"
  # Manual runs pick which environment's health check to execute.
  workflow_dispatch:
    inputs:
      environment:
        description: "Environment to monitor"
        required: true
        default: "production"
        type: choice
        options:
          - production
          - staging

# Least privilege: the jobs only check out code and call external
# endpoints, so the GITHUB_TOKEN needs nothing beyond read access.
permissions:
  contents: read

jobs:
# Health check production
  health-check-production:
    runs-on: ubuntu-latest
    # Runs on every scheduled trigger, or on a manual dispatch that
    # selected the "production" environment.
    if: github.event.inputs.environment == 'production' || github.event.schedule
    # The schedule fires every 5 minutes; never let one run hang for long.
    timeout-minutes: 5
    steps:
      - name: Check production health
        env:
          # Passed through the environment instead of being interpolated
          # into the script text.
          PRODUCTION_URL: ${{ secrets.PRODUCTION_URL }}
        run: |
          # Basic health check.
          # One request supplies both the status code and the timing.
          # "|| true" keeps a connection error from aborting the step
          # (the default shell runs with -e) before the failure message
          # below is printed; curl reports status 000 in that case.
          PROBE=$(curl -s -o /dev/null --max-time 30 \
            -w "%{http_code} %{time_total}" "$PRODUCTION_URL/api/health" || true)
          HTTP_STATUS="${PROBE%% *}"
          RESPONSE_TIME="${PROBE##* }"
          if [ "$HTTP_STATUS" != "200" ]; then
            echo "❌ Production health check failed (HTTP $HTTP_STATUS)"
            exit 1
          fi

          # Response time check: curl reports seconds, convert to whole
          # milliseconds with awk so no extra tool (bc) is required.
          RESPONSE_TIME_MS=$(awk -v t="$RESPONSE_TIME" 'BEGIN { printf "%.0f", t * 1000 }')
          if [ "$RESPONSE_TIME_MS" -gt 5000 ]; then
            echo "⚠️ Production response time is slow: ${RESPONSE_TIME_MS}ms"
          else
            echo "✅ Production health check passed (${RESPONSE_TIME_MS}ms)"
          fi

          # Database connectivity check
          curl -fsS --max-time 30 "$PRODUCTION_URL/api/health/database" || {
            echo "❌ Database connectivity check failed"
            exit 1
          }

          # Redis connectivity check
          curl -fsS --max-time 30 "$PRODUCTION_URL/api/health/redis" || {
            echo "❌ Redis connectivity check failed"
            exit 1
          }
      - name: Alert on failure
        if: failure()
        env:
          # Ref names are user-controlled text; keep them out of the
          # script source and read them as data.
          REF_NAME: ${{ github.ref_name }}
        run: |
          echo "🚨 Production Health Check Failed!"
          echo "Environment: Production"
          echo "Time: $(date)"
          echo "Branch: $REF_NAME"
          echo "Please check the production environment immediately!"
# Health check staging
  health-check-staging:
    runs-on: ubuntu-latest
    # Only runs on a manual dispatch that selected the "staging" environment.
    if: github.event.inputs.environment == 'staging'
    # A stuck request must not keep the runner busy.
    timeout-minutes: 5
    steps:
      - name: Check staging health
        env:
          # Passed through the environment instead of being interpolated
          # into the script text.
          STAGING_URL: ${{ secrets.STAGING_URL }}
        run: |
          # Basic health check.
          # "|| true" keeps a connection error from aborting the step
          # (the default shell runs with -e) before the failure message
          # is printed; curl reports status 000 in that case.
          HTTP_STATUS=$(curl -s -o /dev/null --max-time 30 \
            -w "%{http_code}" "$STAGING_URL/api/health" || true)
          if [ "$HTTP_STATUS" != "200" ]; then
            echo "❌ Staging health check failed (HTTP $HTTP_STATUS)"
            exit 1
          fi
          echo "✅ Staging health check passed"
# Performance monitoring
  performance-monitoring:
    runs-on: ubuntu-latest
    # Scheduled runs only.
    if: github.event.schedule
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: "npm"
      - name: Install dependencies
        run: npm ci
      - name: Run performance tests
        run: |
          # Run basic performance tests.
          # Distinguish "no such script" from "the tests failed": only
          # the former is tolerated, a real failure must fail the step.
          if jq -e '.scripts["test:performance"]' package.json > /dev/null; then
            npm run test:performance
          else
            echo "Performance tests not configured"
          fi
      - name: Check application metrics
        env:
          # Passed through the environment instead of being interpolated
          # into the script text.
          PRODUCTION_URL: ${{ secrets.PRODUCTION_URL }}
        run: |
          # Get application metrics; fall back to an empty object when
          # the endpoint cannot be reached within the time limit.
          METRICS=$(curl -s --max-time 30 "$PRODUCTION_URL/api/metrics" || echo "{}")
          # Parse and check key metrics
          echo "Application metrics:"
          echo "$METRICS"
          # Check if metrics are within acceptable ranges
          # Add your specific metric checks here
# Security monitoring
  security-monitoring:
    runs-on: ubuntu-latest
    # Scheduled runs only.
    if: github.event.schedule
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Run security audit
        # Gate: fails the job when npm audit reports moderate-or-worse
        # vulnerabilities.
        run: |
          npm audit --audit-level=moderate
      - name: Check for known vulnerabilities
        # The audit step above fails exactly when there is something to
        # report, so this step has to run after a failure as well;
        # otherwise the alert below could never be sent.
        if: ${{ !cancelled() }}
        env:
          # Passed through the environment instead of being interpolated
          # into the script text.
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        run: |
          # npm audit exits non-zero when it finds vulnerabilities. The
          # exit code is ignored here so the JSON report can still be
          # parsed (the default shell runs with -e).
          npm audit --audit-level=high --json > audit-results.json || true

          # Check if there are high or critical vulnerabilities
          HIGH_VULNS=$(jq -r '.metadata.vulnerabilities.high // 0' audit-results.json)
          CRITICAL_VULNS=$(jq -r '.metadata.vulnerabilities.critical // 0' audit-results.json)
          if [ -z "$HIGH_VULNS" ] || [ -z "$CRITICAL_VULNS" ]; then
            # An empty report must not be mistaken for "no findings".
            echo "❌ Could not read vulnerability counts from npm audit output"
            exit 1
          fi

          if [ "$HIGH_VULNS" -gt 0 ] || [ "$CRITICAL_VULNS" -gt 0 ]; then
            echo "⚠️ Security vulnerabilities found:"
            echo "High: $HIGH_VULNS, Critical: $CRITICAL_VULNS"
            # Send alert
            if [ -n "$SLACK_WEBHOOK_URL" ]; then
              curl -sS --max-time 30 -X POST -H 'Content-type: application/json' \
                --data "{\"text\":\"⚠️ Security Alert: $HIGH_VULNS high, $CRITICAL_VULNS critical vulnerabilities found in WorkNow\"}" \
                "$SLACK_WEBHOOK_URL"
            else
              echo "SLACK_WEBHOOK_URL is not configured; skipping Slack alert"
            fi
          else
            echo "✅ No critical security vulnerabilities found"
          fi
# Resource monitoring
  resource-monitoring:
    runs-on: ubuntu-latest
    # Scheduled runs only.
    if: github.event.schedule
    steps:
      - name: Check server resources
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.PRODUCTION_HOST }}
          username: ${{ secrets.PRODUCTION_USER }}
          key: ${{ secrets.PRODUCTION_SSH_KEY }}
          # The script below runs on the remote host and only prints
          # warnings; it does not fail the job when a threshold is hit.
          script: |
            # Check disk usage. -P forces one line per filesystem, so
            # field 5 is always the usage percentage.
            DISK_USAGE=$(df -P / | awk 'NR==2 {print $5}' | sed 's/%//')
            if [ "$DISK_USAGE" -gt 80 ]; then
              echo "⚠️ Disk usage is high: ${DISK_USAGE}%"
            fi

            # Check memory usage (used / total from the second line of free)
            MEMORY_USAGE=$(free | awk 'NR==2{printf "%.0f", $3*100/$2}')
            if [ "$MEMORY_USAGE" -gt 80 ]; then
              echo "⚠️ Memory usage is high: ${MEMORY_USAGE}%"
            fi

            # Check CPU load (1-minute average). The comparison is done
            # in awk so the host needs neither bc nor bash arithmetic.
            CPU_LOAD=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')
            if awk -v load="$CPU_LOAD" 'BEGIN { exit !(load > 2.0) }'; then
              echo "⚠️ CPU load is high: $CPU_LOAD"
            fi

            # Check Docker container status; say so explicitly when no
            # matching container is running instead of printing nothing.
            docker ps --format "table {{.Names}}\t{{.Status}}" | grep worknow \
              || echo "⚠️ No running worknow containers found"
            echo "✅ Resource monitoring completed"
# Log analysis
  log-analysis:
    runs-on: ubuntu-latest
    # Scheduled runs only.
    if: github.event.schedule
    steps:
      - name: Analyze application logs
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.PRODUCTION_HOST }}
          username: ${{ secrets.PRODUCTION_USER }}
          key: ${{ secrets.PRODUCTION_SSH_KEY }}
          # The script below runs on the remote host.
          script: |
            # Take one snapshot of the last hour of logs so the count
            # and the excerpts below describe the same data.
            LOG_FILE=$(mktemp)
            trap 'rm -f "$LOG_FILE"' EXIT

            # A failing "docker logs" (for example, container missing)
            # must be reported as such; otherwise its own error text
            # would be counted as a single application error.
            if ! docker logs worknow-app --since=1h > "$LOG_FILE" 2>&1; then
              echo "❌ Could not read logs for worknow-app"
              cat "$LOG_FILE"
              exit 1
            fi

            # Check for error patterns in logs. grep -c exits 1 on zero
            # matches while still printing 0, hence "|| true".
            ERROR_COUNT=$(grep -ci error "$LOG_FILE" || true)
            if [ "$ERROR_COUNT" -gt 10 ]; then
              echo "⚠️ High error count in last hour: $ERROR_COUNT"
              # Get recent errors
              echo "Recent errors:"
              grep -i error "$LOG_FILE" | tail -5
            else
              echo "✅ Error count is normal: $ERROR_COUNT"
            fi

            # Check for specific error patterns
            grep -E "(timeout|connection refused|database error)" "$LOG_FILE" || echo "No critical errors found"
# Notify monitoring summary
  notify-summary:
    runs-on: ubuntu-latest
    needs:
      - health-check-production
      - performance-monitoring
      - security-monitoring
      - resource-monitoring
      - log-analysis
    # always(): report even when one of the needed jobs failed.
    if: always() && github.event.schedule
    steps:
      - name: Send monitoring summary
        env:
          # Derived from the results of the needed jobs. job.status
          # would describe this summary job itself, which has not
          # failed at this point, so it cannot be used for the verdict.
          ALL_SUCCEEDED: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}
        run: |
          echo "📊 WorkNow Monitoring Summary"
          echo "Health Check: ${{ needs.health-check-production.result }}"
          echo "Performance: ${{ needs.performance-monitoring.result }}"
          echo "Security: ${{ needs.security-monitoring.result }}"
          echo "Resources: ${{ needs.resource-monitoring.result }}"
          echo "Logs: ${{ needs.log-analysis.result }}"
          echo "Time: $(date)"
          if [ "$ALL_SUCCEEDED" = "true" ]; then
            echo "✅ All systems operational"
          else
            echo "⚠️ Some issues detected - check individual jobs"
          fi