diff --git a/pipelines/deploy_dv.yaml b/pipelines/deploy_dv.yaml
new file mode 100644
index 00000000..7be7bf5a
--- /dev/null
+++ b/pipelines/deploy_dv.yaml
@@ -0,0 +1,99 @@
+# Manual pipeline: builds the prod image, pushes it to ACR, deploys it to the
+# DV App Service, then applies the app settings listed in "envars".
+trigger: none
+
+parameters:
+  - name: Image
+    displayName: Build Container Image
+    type: string
+    default: "prod"
+    values:
+      # - dev
+      - prod
+
+variables:
+  registry: "hcsccrrc.azurecr.io"
+  repository: "nsp/sdse-spib-hopic-dv"
+  tag: "$(Build.SourceVersion)"
+  appName: was-spib-sdse-hopic-dv
+  subscription: SPIB-SDSE-HOPIC-CICDLZSP-DT
+  resourceGroup: rg-spib-sdse-hopic-dv
+  # Folded (">") into one space-separated line; the last step passes it
+  # verbatim to "az webapp config appsettings set --settings".
+  envars: >
+    DB_HOST=pgsql-spib-sdse-hopic-dv.postgres.database.azure.com
+    DB_PORT=5432
+    DB_SSLMODE=require
+    SECRET_KEY='@Microsoft.KeyVault(SecretUri=https://kvspibsdsehopicdv.vault.azure.net/secrets/hopic-django-secret-key/)'
+    WEBSITES_PORT=8000
+    LATEST_COMMIT_SHA=$(Build.SourceVersion)
+    ALLOWED_HOSTS=was-spib-sdse-hopic-dv.azurewebsites.net
+    CSRF_TRUSTED_ORIGINS=https://was-spib-sdse-hopic-dv.azurewebsites.net
+    DB_NAME=hopicdb_migration
+    DB_PASSWORD='@Microsoft.KeyVault(SecretUri=https://kvspibsdsehopicdv.vault.azure.net/secrets/hopicapp-pgsql-password/)'
+    DB_USER=hopicapp
+    ENV=dev
+    AZCOPY_AUTO_LOGIN_TYPE=MSI
+
+
+pool:
+  name: spib-sdse-hopic-agents-dv
+
+jobs:
+  - job: Deploy_DV
+    steps:
+      - script: |
+          sudo apt-get update
+          sudo apt-get install -y unzip
+        displayName: "Install Unzip"
+
+      - script: |
+          sudo apt-get install -y docker.io
+          sudo apt-get install -y docker-buildx
+          sudo systemctl start docker
+          sudo usermod -aG docker $(id -un)
+          # NOTE(review): a world-writable socket gives every local user root-equivalent
+          # access to the Docker daemon; acceptable only on a dedicated build agent.
+          sudo chmod 666 /var/run/docker.sock
+        displayName: "Install and Configure Docker"
+
+      - script: |
+          curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+        displayName: "Install AZ CLI"
+
+      - task: AzureCLI@2
+        displayName: "Login to ACR"
+        inputs:
+          azureSubscription: $(subscription)
+          scriptType: bash
+          scriptLocation: inlineScript
+          inlineScript: |
+            az --version
+            az acr login --name hcsccrrc
+
+      - ${{ if eq(parameters.Image, 'prod')}}:
+          - script: |
+              # Stop at the first failure so a failed build is never followed by a push.
+              set -e
+              docker build -t $(registry)/$(repository):$(tag) -f server/Dockerfile.prod .
+              docker push $(registry)/$(repository):$(tag)
+            displayName: "Build and Push $(repository) Image"
+
+      - task: AzureWebAppContainer@1
+        displayName: "Install $(repository) into $(appName)"
+        inputs:
+          azureSubscription: "$(subscription)"
+          appName: "$(appName)"
+          deployToSlotOrASE: true
+          resourceGroupName: "$(resourceGroup)"
+          containers: "$(registry)/$(repository):$(tag)"
+          containerCommand: "gunicorn --bind 0.0.0.0:8000 server.wsgi --timeout 1000"
+
+      - task: AzureCLI@2
+        displayName: "AppSettings for $(appName)"
+        inputs:
+          azureSubscription: $(subscription)
+          scriptType: bash
+          scriptLocation: inlineScript
+          inlineScript: |
+            az webapp config appsettings set -g $(resourceGroup) -n $(appName) --settings ${{ variables.envars }}
diff --git a/server/Dockerfile.prod b/server/Dockerfile.prod
index 38810aad..c7c5e913 100644
--- a/server/Dockerfile.prod
+++ b/server/Dockerfile.prod
@@ -1,60 +1,103 @@
-######################
-# DEPENDENCY BUILDER #
-######################
-# NOTE: builder layer must mach python and distribution versions of distroless runtime layer!
-FROM python:3.11-bookworm as build_env
-
-# MUST keep these envs in sync with the Dockerfile.prod "FINAL" layer AND with Dockerfile.dev-management
-ENV HOME=/cpho
-ENV APP_HOME=$HOME/web
-ENV PYTHON_DEPS=$HOME/python_deps
+# Builds a Prod Image expecting to write to ACR and run the image in an app service container
 
-RUN mkdir "${HOME}" && \
-    mkdir "${APP_HOME}" && \
-    mkdir "${PYTHON_DEPS}"
+###########
+# BUILDER #
+###########
+FROM python:3.11-slim-bookworm AS builder
 
-# Update pip
-RUN pip install --upgrade pip
+# set work directory
+WORKDIR /usr/src/app
 
-COPY ./requirements.txt .
-COPY ./requirements_dev.txt .
-COPY ./requirements_formatting.txt .
+# set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
 
-ARG DEPENDENCY_SET="prod"
+# Install build dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    libpq-dev \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
 
-RUN [ "${DEPENDENCY_SET}" = "prod" ] \
-    && pip install --no-cache-dir --target "${PYTHON_DEPS}" -r requirements.txt \
-    || :
+RUN pip install --upgrade pip wheel setuptools
 
-RUN [ "${DEPENDENCY_SET}" = "test" ] \
-    && pip install --no-cache-dir --target "${PYTHON_DEPS}" -r requirements.txt -r requirements_dev.txt -r requirements_formatting.txt \
-    || :
+# requirements and wheels
+COPY server/requirements.txt .
+RUN pip wheel --no-cache-dir --no-deps --wheel-dir /usr/src/app/wheels -r requirements.txt
 
 #########
 # FINAL #
 #########
-FROM gcr.io/distroless/python3-debian12
+# pull official base image
+FROM python:3.11-slim-bookworm
+
+# Make sure setuptools is always up to date
+RUN pip install --upgrade pip setuptools
 
-# MUST keep these envs in sync with the Dockerfile.prod "DEPENDENCY BUILDER" layer AND with Dockerfile.dev-management
-ENV HOME=/cpho
+# Environment Variables
+ENV APP_NAME=hopicapp
+ENV APP_USER=${APP_NAME}user
+ENV HOME=/${APP_NAME}
 ENV APP_HOME=$HOME/web
-ENV PYTHON_DEPS=$HOME/python_deps
+ENV VIRTUALENV=$HOME/env
+ENV WHEELDIR=$HOME/wheels
+ENV PATH=$VIRTUALENV/bin:$PATH
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+EXPOSE 8000 2222
+
+# Install runtime packages, create the app user, and set up SSH logins (root password, motd, login directory); printf is used because /bin/sh echo has no -e
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+    gosu \
+    libpq5 \
+    netcat-traditional \
+    openssh-server \
+    postgresql-common \
+    && addgroup --system $APP_NAME \
+    && adduser --disabled-password --shell /bin/bash $APP_USER --system --ingroup $APP_NAME --home $HOME \
+    && mkdir -p $APP_HOME $VIRTUALENV $WHEELDIR \
+    && chown -R $APP_USER:$APP_NAME $HOME \
+    && echo "root:Docker!" | chpasswd \
+    && printf 'HOPIC Container\n\nFor management commands run: \ncd %s \npython manage.py\n\n' "$APP_HOME" > /etc/motd \
+    && printf 'cd %s\n\n' "$APP_HOME" >> /etc/profile
 
-# this is the ID of the distroless "nonroot" user, using ID instead of user name because the k8s runAsNonRoot security context
-# can't verify non-rootness when the docker file sets user by name
-# https://github.com/GoogleContainerTools/distroless/blob/9c5d2c431825d7aa21017551b2ec75c29c1f23c6/common/variables.bzl#L18
-ENV NONROOT_USER_ID=65532
+WORKDIR $APP_HOME
 
-ENV PATH="${PYTHON_DEPS}/bin:${PATH}"
-ENV PYTHONPATH="${PYTHON_DEPS}:${PYTHONPATH}"
+# Copy wheels from builder stage
+COPY --chown=$APP_USER:$APP_NAME --from=builder /usr/src/app/wheels $WHEELDIR
 
-COPY --chown=$NONROOT_USER_ID:$NONROOT_USER_ID --from=build_env $HOME $HOME
-COPY --chown=$NONROOT_USER_ID:$NONROOT_USER_ID . $APP_HOME
-WORKDIR $APP_HOME
-USER $NONROOT_USER_ID
+# Create and configure virtual environment
+RUN python -m venv $VIRTUALENV && \
+    $VIRTUALENV/bin/pip install --upgrade pip && \
+    $VIRTUALENV/bin/pip install --no-cache-dir $WHEELDIR/* && \
+    $VIRTUALENV/bin/pip install --force-reinstall setuptools && \
+    rm -rf $WHEELDIR
+
+# Copy project files
+COPY --chown=$APP_USER:$APP_NAME server/ $APP_HOME
+
+# copy sshd_config file
+COPY server/sshd_config /etc/ssh/
+
+# copy version file
+# COPY version.txt $APP_HOME
+
+# Make the entrypoint executable and collect static files at build time (dummy settings values; no database is contacted)
+RUN chmod +x $APP_HOME/entrypoint.prod.sh && \
+    rm -f $APP_HOME/sshd_config && \
+    mkdir -p $APP_HOME/staticfiles && \
+    SECRET_KEY=t ALLOWED_HOSTS=* DB_NAME=d DB_USER=d DB_PASSWORD=d DB_HOST=d DB_PORT=1 python -m manage collectstatic --no-input
 
-EXPOSE 8080
+# Set entrypoint and default command
+# The wrapper resolves APP_HOME at run time and execs the real entrypoint, so "$@" and signals pass straight through
+RUN printf '#!/bin/bash\nexec "${APP_HOME}/entrypoint.prod.sh" "$@"\n' > /entrypoint-wrapper.sh && \
+    chmod +x /entrypoint-wrapper.sh
 
-ENTRYPOINT [ "python", "./entrypoint.prod.py" ]
+ENTRYPOINT ["/entrypoint-wrapper.sh"]
+CMD ["gunicorn", "--bind", "0.0.0.0:8000", "server.wsgi"]
diff --git a/server/entrypoint.prod.sh b/server/entrypoint.prod.sh
new file mode 100644
index 00000000..7b106f4c
--- /dev/null
+++ b/server/entrypoint.prod.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Container entrypoint: starts sshd, collects static files, waits for the
+# database and migrates it, then runs the container command ("$@") as APP_USER.
+echo "Starting SSH server..."
+service ssh start
+
+echo "Generating static files..."
+python manage.py collectstatic --no-input
+
+if [ -n "$DB_HOST" ] && [ -n "$DB_PORT" ]; then
+    echo "Waiting for postgres ($DB_HOST:$DB_PORT)..."
+
+    while ! nc -z "$DB_HOST" "$DB_PORT"; do
+      sleep 0.1
+    done
+    sleep 1
+
+    echo "PostgreSQL started"
+    echo "applying migrations..."
+    # Abort startup rather than serve against a partially migrated schema.
+    if ! python manage.py migrate; then
+        echo "migrations failed" >&2
+        exit 1
+    fi
+    echo "migrations applied"
+
+fi
+
+# Append the container environment to /etc/profile so login shells can see it.
+# %q shell-quotes each value, so "$", backticks, quotes and newlines survive being sourced.
+while IFS= read -r -d '' entry; do
+    printf 'export %s=%q\n' "${entry%%=*}" "${entry#*=}"
+done < <(env -0) >> /etc/profile
+
+exec gosu "${APP_USER}" "$@"
diff --git a/server/gunicorn.conf.py b/server/gunicorn.conf.py
index 0b3f2d78..f32fff92 100644
--- a/server/gunicorn.conf.py
+++ b/server/gunicorn.conf.py
@@ -3,8 +3,6 @@
 
 import structlog
 
-from server.open_telemetry_util import instrument_app_for_open_telemetry
-
 # See https://cloud.google.com/run/docs/tips/python#optimize_gunicorn
 
 PORT = os.getenv("PORT", "8080")
@@ -26,7 +24,7 @@ def post_fork(server, worker):
     # If NOT using BatchSpanProcessor (likely a bad idea, it's much more performant at run time) you can move instrumentation to wsgi.py and enable preload_app
     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "server.settings")
 
-    worker.flush_telemetry_callback = instrument_app_for_open_telemetry()
+    # worker.flush_telemetry_callback = instrument_app_for_open_telemetry()
 
 
 def worker_exit(server, worker):
diff --git a/server/manage.py b/server/manage.py
index 96a699f6..a7eae46d 100755
--- a/server/manage.py
+++ b/server/manage.py
@@ -5,15 +5,12 @@
 import sys
 
 from server.config_util import get_project_config
-from server.open_telemetry_util import instrument_app_for_open_telemetry
 
 
 def main():
     """Run
administrative tasks.""" os.environ.setdefault("DJANGO_SETTINGS_MODULE", "server.settings") - instrument_app_for_open_telemetry() - try: from django.core.management import execute_from_command_line except ImportError as exc: diff --git a/server/server/open_telemetry_util.py b/server/server/open_telemetry_util.py deleted file mode 100644 index 8a53ca7e..00000000 --- a/server/server/open_telemetry_util.py +++ /dev/null @@ -1,195 +0,0 @@ -import logging -import os -import sys -import time - -import requests -from opentelemetry import trace -from opentelemetry.exporter.cloud_trace import CloudTraceSpanExporter -from opentelemetry.instrumentation.django import DjangoInstrumentor -from opentelemetry.instrumentation.psycopg2 import Psycopg2Instrumentor -from opentelemetry.propagate import set_global_textmap -from opentelemetry.propagators.cloud_trace_propagator import ( - CloudTraceFormatPropagator, -) -from opentelemetry.resourcedetector.gcp_resource_detector import ( - GoogleCloudResourceDetector, -) -from opentelemetry.sdk.resources import ProcessResourceDetector -from opentelemetry.sdk.trace import TracerProvider, sampling -from opentelemetry.sdk.trace.export import ( - BatchSpanProcessor, - ConsoleSpanExporter, -) -from phac_aspc.django.helpers.logging.utils import ( - add_fields_to_all_logs_for_current_request, -) - -from server.config_util import get_project_config, is_running_tests - -logger = logging.getLogger() - - -def instrument_app_for_open_telemetry(): - config = get_project_config() - - IS_LOCAL = config("IS_LOCAL", cast=bool, default=False) - FORCE_LOCAL_OTEL_BEHAVIOUR = config( - "FORCE_LOCAL_OTEL_BEHAVIOUR", cast=bool, default=False - ) - - if IS_LOCAL or FORCE_LOCAL_OTEL_BEHAVIOUR: - project_id = "local" - - OUTPUT_TELEMETRY_TO_CONSOLE = config( - "OUTPUT_TELEMETRY_TO_CONSOLE", cast=bool, default=False - ) - - span_exporter = ConsoleSpanExporter( - out=( - sys.stdout - if OUTPUT_TELEMETRY_TO_CONSOLE - else open(os.devnull, "w") - ) - ) - - resource = 
ProcessResourceDetector(raise_on_error=True).detect() - else: - # In Google Cloud, we must request resources information from a metadata server (metadata.google.internal). - # In theory this is consistently reachable across GCP solutions (Cloud Run, App Engine, GKE, etc), - # but in practice there's a big gotcha in GKE. New pods do not immediately have access to the metadata - # server, and may not for a "few seconds" according to the docs linked below. - # https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity#project_metadata - # - # Open telemetry "resource" information is used to identify the source of a span, and is imutable once - # the corresponding trace provider has been initialized... which all needs to happen before the Django server - # is initialized. This is all not ideal for cold start times! Not the slowest part though, running collect static - # and migrations on pod start is a bigger slow down right now. Retrying the metadata.google.internal request with - # a short linear delay is the best solution for now. Don't bother with exponential backoff because we want to know - # asap and aren't worried about load on the metadata server (in theory; something to keep an eye on in practice). - # - # Note: we directly call metadata.google.internal below for the project ID, which _could_ be passed as - # an env var, _but_ the GoogleCloudResourceDetector call following that also requires metadata server - # access. GoogleCloudResourceDetector doesn't have the logic to wait for the metadata server so we need to - # implement logic to wait for metadata.google.internal access our selves either way. 
- retry_limit = 12 - retry_delay = 0.25 - - logger.info("Attempting to connect to Google Cloud metadata server...") - project_id = None - for retry_count in range(retry_limit): - try: - project_id = requests.get( - "http://metadata.google.internal/computeMetadata/v1/project/project-id", - headers={"Metadata-Flavor": "Google"}, - ).text - - break - except requests.ConnectionError as error: - if retry_count < retry_limit - 1: - time.sleep(retry_delay) - else: - logger.error(error) - logger.info("Metadata server reachable!") - - span_exporter = CloudTraceSpanExporter( - project_id=project_id, - # resource labels aren't exported in GCP by default, as the labels aren't actually supported - # by Cloud Trace. This regex pattern is used to select resource labels to pick out and convert - # to span attributes - resource_regex=".*", - ) - - # WARNING: you might see examples wrapping a list of resource detectors in - # `opentelemetry.sdk.resources.get_aggregated_resources`. This calls detect() and - # merges the results for you BUT it uses thread pools and may not be suited for all - # prod environments (Cloud Run, small k8s pods, etc). - # Manually call detect and merge as needed instead, not a big deal, this only happens once - # and isn't CPU intensive at all. - # Note: for merge, the order matters with priority given to preceding resource objects - if not project_id: - resource = ProcessResourceDetector(raise_on_error=True).detect() - else: - resource = ( - GoogleCloudResourceDetector(raise_on_error=True).detect() - ).merge(ProcessResourceDetector(raise_on_error=True).detect()) - - # Propagate the X-Cloud-Trace-Context header if present. 
Add it otherwise - set_global_textmap(CloudTraceFormatPropagator()) - - # A BatchSpanProcessor is significantly better for performance, but has some caveats: - # 1) gunicorn caveat: it uses a worker thread, which means instrumentation calls must happen post-gunicorn - # worker fork, or else multiple gunicron app worker threads will attempt to share one BatchSpanProcessor - # worker (and trip over eachother's process locks). Does not apply if gunicorn workers = threads = 1 - # 2) Cloud Run caveat: GCP docs say NOT to use BatchSpanProcessor in Cloud Run, as Cloud Run "does not - # support background processes". That is a simplification though, what they really mean is that a Cloud Run - # container will lose it's CPU when not actively processing a request, so background processes not tied to - # request handling may not have a chance to immediately finish all their work without interuption. They can - # still resume in the background when the container next receives a request. In the case that a container is - # terminated before receiving a new request, the container receives a SIGTERM signal and 10 seconds of grace time - # with a CPU to wrap things up (https://cloud.google.com/run/docs/container-contract#lifecycle-services). - # This caveat may apply in other auto-scalling environments - # The returned `flush_telemetry_callback` can be used to manage this if your environment requires. - span_processor = BatchSpanProcessor(span_exporter) - - tracer_provider = TracerProvider( - active_span_processor=span_processor, - resource=resource, - # Always sample, even if propagating a trace that wasn't sampled in earlier stages (load balancer, etc). 
- # This could be too noisy on a busier app, but should be fine for CPHO's expected usage - sampler=sampling.ALWAYS_ON, - ) - - def associate_request_logs_to_telemetry(span, request): - add_fields_to_all_logs_for_current_request( - { - # see https://cloud.google.com/trace/docs/trace-log-integration#associating - # and https://cloud.google.com/logging/docs/structured-logging#special-payload-fields - "logging.googleapis.com/trace": ( - f"projects/{project_id}/traces/{trace.span.format_trace_id(span.get_span_context().trace_id)}" - ), - "logging.googleapis.com/spanId": ( - trace.span.format_span_id(span.get_span_context().span_id) - ), - # This one's awkward, see: https://www.w3.org/TR/trace-context/#sampled-flag - # Right now the only trace flag is the "sampled flag", so `trace_flags` is either 0 or 1; - # the "correct" way to get `trace_sampled` would be `span.get_span_context().trace_flags == 1`, - # but that seems fragile and might not pick up on overrides, like sampler=sampling.ALWAYS_ON? - # `span.is_recording()` doesn't indicate that the _whole_ trace is sampled, but it should - # indicate that the current span within the trace is reporting/being sampled, which is what this - # log field is actually intended for - "logging.googleapis.com/trace_sampled": span.is_recording(), - } - ) - - Psycopg2Instrumentor().instrument( - tracer_provider=tracer_provider, - enable_commenter=True, - commenter_options={}, - # This instrumentor expects the `psycopg2` package. This repo uses the `psycopg2-binary` package. - # Compatible with both, but need to disable the instrumentor's dependency checking - skip_dep_check=True, - ) - DjangoInstrumentor().instrument( - tracer_provider=tracer_provider, - meter_provider=None, # TODO - request_hook=associate_request_logs_to_telemetry, - # GOTCHA: in Cloud Run, if we disable our own instrumentation, I believe it just falls back to using - # the default tracing Google has on Cloud Run instance... 
so you'll still get generic spans for excluded routes.
-        # The default tracing is much lighter weight, so disabling does server _some_ purpose. This will also work as
-        # expected in non-Cloud Run deployments
-        excluded_urls=config(
-            "OTEL_PYTHON_DJANGO_EXCLUDED_URLS", default="healthcheck"
-        ),
-        # Confusingly named (typo included), when True this actually adds a sqlcommenter django middleware.
-        # When enabled, trace metadata is inserted as comments in each SQL query, allowing the corresponding logging
-        # output on the DB side to be associated back to the initiating trace.
-        # Currently disabled; may have a performance impact and, more importantly, currently causes test_infobase_export.py to fail
-        is_sql_commentor_enabled=False,
-    )
-
-    def flush_telemetry_callback():
-        tracer_provider.force_flush()
-        tracer_provider.shutdown()
-
-    return flush_telemetry_callback
diff --git a/server/server/settings.py b/server/server/settings.py
index 03d2bf47..d37826e1 100644
--- a/server/server/settings.py
+++ b/server/server/settings.py
@@ -16,7 +16,7 @@
 
 from django.urls import reverse_lazy
 
-from decouple import Csv
+from decouple import Csv, config
 
 from server.config_util import get_project_config, is_running_tests
 
@@ -25,7 +25,7 @@
 
 BASE_DIR = Path(__file__).resolve().parent.parent
 
-config = get_project_config()
+# config = get_project_config()
 
 IS_LOCAL = config("IS_LOCAL", cast=bool, default=False)
 IS_DEV = config("IS_DEV", cast=bool, default=False)
@@ -96,7 +96,7 @@
 # Additional CORS allowed and CSRF trusted origins should be empty until if/when the app
 # is serving a REST/GraphQL API for external consumption
 CORS_ALLOWED_ORIGINS = []
-CSRF_TRUSTED_ORIGINS = []
+CSRF_TRUSTED_ORIGINS = config("CSRF_TRUSTED_ORIGINS", default="", cast=Csv())
 
 # Prod only security settings
 if not IS_DEV:
diff --git a/server/sshd_config b/server/sshd_config
new file mode 100644
index 00000000..c6ceb581
--- /dev/null
+++ b/server/sshd_config
@@ -0,0 +1,17 @@
+# sshd configuration for SSH access to the running container.
+# NOTE(review): port, ciphers, MACs and root password login appear to mirror the
+# Azure App Service sample sshd_config - confirm against that documentation before tightening.
+Port 2222
+ListenAddress 0.0.0.0
+LoginGraceTime 180
+# X11 forwarding is not needed for shell access to a headless container, so it stays off.
+X11Forwarding no
+Ciphers aes128-cbc,3des-cbc,aes256-cbc,aes128-ctr,aes192-ctr,aes256-ctr
+MACs hmac-sha1,hmac-sha1-96
+StrictModes yes
+SyslogFacility DAEMON
+PasswordAuthentication yes
+PermitEmptyPasswords no
+PermitRootLogin yes
+Subsystem sftp internal-sftp
+