Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions compose/local/django/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
libpq-dev \
# Translations dependencies
gettext \
# healthcheck dependencies
procps \
# cleaning up unused files
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
Expand Down Expand Up @@ -74,6 +76,10 @@ COPY ./compose/local/django/celery/flower/start /start-flower
RUN sed -i 's/\r$//g' /start-flower
RUN chmod +x /start-flower

# Copy the celery scripts directory so the healthcheck script is available
# inside the image at /celery/healthcheck.sh.
COPY ./compose/local/django/celery /celery
# Strip Windows (CRLF) line endings, as is done for the other copied scripts,
# so the shebang line still resolves when the repo was checked out with CRLF.
RUN sed -i 's/\r$//g' /celery/healthcheck.sh
RUN chmod +x /celery/healthcheck.sh


# copy application code to WORKDIR
COPY . ${APP_HOME}
Expand Down
46 changes: 46 additions & 0 deletions compose/local/django/celery/healthcheck.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
#
# Celery Worker Healthcheck Script
#
# Exits 0 when the worker looks healthy and 1 otherwise. Two checks are made:
#   1. Process check - does a process whose command line matches
#      "celery.*worker" exist?
#   2. Broker check - can a connection to the broker named by
#      CELERY_BROKER_URL be opened within a few seconds?
#
# Neither check proves the worker is actually consuming tasks; they only catch
# a missing worker process or an unreachable broker.
#
# When used with the autoheal container, unhealthy workers will be
# automatically restarted.

set -euo pipefail

# Check 1: Is the celery worker process running?
# pgrep -f matches against the full command line, not just the process name.
if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
  echo "ERROR: Celery worker process not found" >&2
  exit 1
fi

# Check 2: Can we connect to RabbitMQ (the broker)?
# The broker check is best effort: without a Python interpreter it cannot run,
# so say so on stderr instead of silently reporting a fully checked worker.
if ! command -v python > /dev/null 2>&1; then
  echo "WARNING: python not found; skipping RabbitMQ broker connectivity check" >&2
  exit 0
fi

# The broker URL is looked up from the environment inside Python rather than
# being expanded by the shell. `timeout 5` is a hard upper bound in case the
# connection attempt hangs.
# NOTE(review): the fallback URL embeds default credentials - confirm that is
# intended for every environment this script runs in.
if ! timeout 5 python -c "
import sys
import os
from kombu import Connection
try:
    broker_url = os.environ.get('CELERY_BROKER_URL', 'amqp://rabbituser:rabbitpass@rabbitmq:5672/')
    conn = Connection(broker_url)
    conn.ensure_connection(max_retries=1, timeout=3)
    conn.release()
    sys.exit(0)
except Exception as e:
    print('ERROR: Cannot connect to RabbitMQ broker: {0}'.format(str(e)), file=sys.stderr)
    sys.exit(1)
" 2>&1; then
  echo "ERROR: Cannot connect to RabbitMQ broker" >&2
  exit 1
fi

# All checks passed
exit 0
15 changes: 12 additions & 3 deletions compose/local/django/celery/worker/start
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,20 @@ set -o errexit
set -o pipefail
set -o nounset

# Local development with auto-reload and optional debugging
#
# DEBUGGER=1 - Enable debugpy for remote debugging on port 5679
#
# Worker protections (same as production):
# --max-tasks-per-child=50 - Restart after 50 tasks (prevents memory leaks)
# --max-memory-per-child=4000000 - Restart if memory exceeds 4GB

# Launch VS Code debug server if DEBUGGER environment variable is set to 1
# Note that auto reloading is disabled when debugging, manual restart required for code changes.
if [ "${DEBUGGER:-0}" = "1" ]; then
# exec watchfiles --filter python 'python -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker -l INFO'
exec python -Xfrozen_modules=off -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker --queues=antenna -l INFO
echo "Starting Celery worker with debugpy on port 5679..."
exec python -Xfrozen_modules=off -m debugpy --listen 0.0.0.0:5679 -m celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
else
exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker --queues=antenna -l INFO'
echo "Starting Celery worker with watchfiles auto-reload..."
exec watchfiles --filter python celery.__main__.main --args '-A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000'
fi
6 changes: 6 additions & 0 deletions compose/production/django/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
libpq-dev \
# Translations dependencies
gettext \
# healthcheck dependencies
procps \
# cleaning up unused files
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
Expand Down Expand Up @@ -80,6 +82,10 @@ COPY ./compose/production/django/celery/flower/start /start-flower
RUN sed -i 's/\r$//g' /start-flower
RUN chmod +x /start-flower

# Copy the celery scripts directory so the healthcheck script is available
# inside the image at /celery/healthcheck.sh.
COPY --chown=django:django ./compose/production/django/celery /celery
# Strip Windows (CRLF) line endings, as is done for the other copied scripts,
# so the shebang line still resolves when the repo was checked out with CRLF.
RUN sed -i 's/\r$//g' /celery/healthcheck.sh
RUN chmod +x /celery/healthcheck.sh


# copy application code to WORKDIR
COPY --chown=django:django . ${APP_HOME}
Expand Down
46 changes: 46 additions & 0 deletions compose/production/django/celery/healthcheck.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/bin/bash
#
# Celery Worker Healthcheck Script (Production)
#
# Exits 0 when the worker looks healthy and 1 otherwise. Two checks are made:
#   1. Process check - does a process whose command line matches
#      "celery.*worker" exist?
#   2. Broker check - can a connection to the broker named by
#      CELERY_BROKER_URL be opened within a few seconds?
#
# Neither check proves the worker is actually consuming tasks; they only catch
# a missing worker process or an unreachable broker.
#
# When used with the autoheal container, unhealthy workers will be
# automatically restarted.

set -euo pipefail

# Check 1: Is the celery worker process running?
# pgrep -f matches against the full command line, not just the process name.
if ! pgrep -f "celery.*worker" > /dev/null 2>&1; then
  echo "ERROR: Celery worker process not found" >&2
  exit 1
fi

# Check 2: Can we connect to RabbitMQ (the broker)?
# The broker check is best effort: without a Python interpreter it cannot run,
# so say so on stderr instead of silently reporting a fully checked worker.
if ! command -v python > /dev/null 2>&1; then
  echo "WARNING: python not found; skipping RabbitMQ broker connectivity check" >&2
  exit 0
fi

# The broker URL is looked up from the environment inside Python rather than
# being expanded by the shell. `timeout 5` is a hard upper bound in case the
# connection attempt hangs.
# NOTE(review): the fallback URL embeds default credentials - confirm that a
# production deployment always sets CELERY_BROKER_URL so it is never used.
if ! timeout 5 python -c "
import sys
import os
from kombu import Connection
try:
    broker_url = os.environ.get('CELERY_BROKER_URL', 'amqp://rabbituser:rabbitpass@rabbitmq:5672/')
    conn = Connection(broker_url)
    conn.ensure_connection(max_retries=1, timeout=3)
    conn.release()
    sys.exit(0)
except Exception as e:
    print('ERROR: Cannot connect to RabbitMQ broker: {0}'.format(str(e)), file=sys.stderr)
    sys.exit(1)
" 2>&1; then
  echo "ERROR: Cannot connect to RabbitMQ broker" >&2
  exit 1
fi

# All checks passed
exit 0
8 changes: 7 additions & 1 deletion compose/production/django/celery/worker/start
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,10 @@ set -o errexit
set -o pipefail
set -o nounset

exec newrelic-admin run-program celery -A config.celery_app worker --queues=antenna -l INFO
# Production Celery worker with protections
#
# Worker protections:
# --max-tasks-per-child=50 - Restart after 50 tasks (prevents memory leaks)
# --max-memory-per-child=4000000 - Restart if memory exceeds 4GB

exec newrelic-admin run-program celery -A config.celery_app worker --queues=antenna -l INFO --max-tasks-per-child=50 --max-memory-per-child=4000000
35 changes: 35 additions & 0 deletions docker-compose.production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,28 @@ services:
ports: []
command: /start-celeryworker
restart: always
healthcheck:
test: ["CMD-SHELL", "/celery/healthcheck.sh"]
interval: 30s # Check every 30 seconds
timeout: 15s # Healthcheck must complete within 15s (connection timeout is 5s + overhead)
retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
start_period: 60s # Grace period during container startup
labels:
- "autoheal=true" # Enable autoheal to restart this container when unhealthy

celerybeat:
  <<: *django
  ports: []
  command: /start-celerybeat
  restart: always
  healthcheck:
    # CMD-SHELL runs this string via `sh -c`, so the wrapping shell's own
    # command line contains the pgrep pattern text. pgrep -f only excludes
    # itself, not that parent shell, so a plain 'celery.*beat' pattern would
    # always find a match and the check could never fail. Writing the first
    # letter as a bracket expression ([c]) keeps the regex equivalent while
    # making the literal pattern text no longer match itself.
    test: ["CMD-SHELL", "pgrep -f '[c]elery.*beat' > /dev/null || exit 1"]
    interval: 60s # Beat is less critical, check every minute
    timeout: 10s
    retries: 3
    start_period: 30s
  labels:
    - "autoheal=true" # Opt this container in to restarts by the autoheal service

flower:
<<: *django
Expand All @@ -45,6 +61,25 @@ services:
restart: always
volumes:
- ./data/flower/:/data/
healthcheck:
test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:5555/', timeout=3)\" || exit 1"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
labels:
- "autoheal=true"

# Sidecar that watches the containers labelled "autoheal=true" in this file and
# restarts them when their healthcheck reports unhealthy.
autoheal:
image: willfarrell/autoheal:latest # NOTE(review): unpinned tag - consider pinning a specific version for reproducible deploys
container_name: ami_production_autoheal
restart: always
environment:
- AUTOHEAL_CONTAINER_LABEL=autoheal
- AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
- AUTOHEAL_START_PERIOD=60 # NOTE(review): the autoheal README describes this as the wait before autoheal's own first check, not a per-container grace period - confirm
volumes:
# The Docker socket is mounted so autoheal can talk to the Docker daemon;
# this gives the container control over other containers on the host.
- /var/run/docker.sock:/var/run/docker.sock

awscli:
build:
Expand Down
19 changes: 19 additions & 0 deletions docker-compose.worker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,22 @@ services:
ports: []
command: /start-celeryworker
restart: always
healthcheck:
test: ["CMD-SHELL", "/celery/healthcheck.sh"]
interval: 30s # Check every 30 seconds
timeout: 15s # Healthcheck must complete within 15s (connection timeout is 5s + overhead)
retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
start_period: 60s # Grace period during container startup
labels:
- "autoheal=true" # Enable autoheal to restart this container when unhealthy

# Sidecar that watches the containers labelled "autoheal=true" in this file and
# restarts them when their healthcheck reports unhealthy.
autoheal:
image: willfarrell/autoheal:latest # NOTE(review): unpinned tag - consider pinning a specific version for reproducible deploys
container_name: ami_worker_autoheal
restart: always
environment:
- AUTOHEAL_CONTAINER_LABEL=autoheal
- AUTOHEAL_INTERVAL=10 # Check container health every 10 seconds
- AUTOHEAL_START_PERIOD=60 # NOTE(review): the autoheal README describes this as the wait before autoheal's own first check, not a per-container grace period - confirm
volumes:
# The Docker socket is mounted so autoheal can talk to the Docker daemon;
# this gives the container control over other containers on the host.
- /var/run/docker.sock:/var/run/docker.sock
6 changes: 6 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ services:
command: /start-celeryworker
depends_on:
- rabbitmq
healthcheck:
test: ["CMD-SHELL", "/celery/healthcheck.sh"]
interval: 30s # Check every 30 seconds
timeout: 15s # Healthcheck must complete within 15s (connection timeout is 5s + overhead)
retries: 3 # Mark unhealthy after 3 consecutive failures (90s total)
start_period: 60s # Grace period during container startup

celerybeat:
<<: *django
Expand Down