Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ CLOUDWATCH_FROM_DATE = $(shell ./scripts/determine-cloudwatch-from-date-based-on
CLOUDWATCH_TO_DATE = $(shell date '+%Y-%m-%d')
CLOUDWATCH_TARGET_DIR = ./logs/cloudwatch
CLOUDWATCH_JSONL_FILE = ./logs/ingress.jsonl
CLOUDWATCH_JSONL_GZ_FILE = $(CLOUDWATCH_JSONL_FILE).gz
CLOUDWATCH_JSONL_SCHEMA_FILE = $(CLOUDWATCH_JSONL_FILE).bq-schema.json


Expand Down Expand Up @@ -84,32 +85,33 @@ download-events-from-s3:
"$(CLOUDWATCH_TO_DATE)" \
"$(CLOUDWATCH_TARGET_DIR)"

.convert-cloudwatch-logs-to-jsonl:
./scripts/convert-cloudwatch-logs-to-jsonl.sh \
.convert-gzipped-cloudwatch-logs-to-jsonl-gz:
./scripts/convert-gzipped-cloudwatch-logs-to-jsonl-gz.sh \
"$(CLOUDWATCH_TARGET_DIR)" \
"$(CLOUDWATCH_JSONL_FILE)"
"$(CLOUDWATCH_JSONL_GZ_FILE)"

.generate-schema-for-cloudwatch-jsonl-file: venv
cat "$(CLOUDWATCH_JSONL_FILE)" \
.generate-schema-for-cloudwatch-jsonl-gz-file: venv
cat "$(CLOUDWATCH_JSONL_GZ_FILE)" \
| zcat \
| venv/bin/generate-schema \
> "$(CLOUDWATCH_JSONL_SCHEMA_FILE)"

.upload-ingress-jsonl-to-bigquery:
.upload-ingress-jsonl-gz-to-bigquery:
bq load \
--project_id=elife-data-pipeline \
--noreplace \
--schema="$(CLOUDWATCH_JSONL_SCHEMA_FILE)" \
--schema_update_option=ALLOW_FIELD_ADDITION \
--source_format=NEWLINE_DELIMITED_JSON \
de_proto.sciety_ingress_v1 \
"$(CLOUDWATCH_JSONL_FILE)"
"$(CLOUDWATCH_JSONL_GZ_FILE)"

.do-upload-ingress-logs-from-cloudwatch-to-bigquery:
$(MAKE) .cloudwatch-show-info
$(MAKE) .export-and-download-from-cloudwatch
$(MAKE) .convert-cloudwatch-logs-to-jsonl
$(MAKE) .generate-schema-for-cloudwatch-jsonl-file
$(MAKE) .upload-ingress-jsonl-to-bigquery
$(MAKE) .convert-gzipped-cloudwatch-logs-to-jsonl-gz
$(MAKE) .generate-schema-for-cloudwatch-jsonl-gz-file
$(MAKE) .upload-ingress-jsonl-gz-to-bigquery

.upload-ingress-logs-from-cloudwatch-to-bigquery:
@if [ "$(CLOUDWATCH_FROM_DATE)" = "$(CLOUDWATCH_TO_DATE)" ]; then \
Expand Down
8 changes: 0 additions & 8 deletions scripts/convert-cloudwatch-logs-to-bigquery-jsonl.sh

This file was deleted.

18 changes: 0 additions & 18 deletions scripts/convert-cloudwatch-logs-to-jsonl.sh

This file was deleted.

16 changes: 16 additions & 0 deletions scripts/convert-gzipped-cloudwatch-logs-to-bigquery-jsonl-gz.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

# Converts a single gzipped log file into a gzipped JSONL file.
#
# Usage: convert-gzipped-cloudwatch-logs-to-bigquery-jsonl-gz.sh <gzipped log file>
#
# The output is written next to the input: the input path with its last
# ".<extension>" removed and ".jsonl.gz" appended (e.g. "000000.gz" becomes
# "000000.jsonl.gz"). An existing output file is overwritten.
#
# Per input line:
#   1. everything up to and including the first space is removed
#      (presumably the CloudWatch export timestamp prefix - TODO confirm);
#   2. the remainder is parsed by jq, which drops the top-level "kubernetes"
#      and "docker" keys and emits one compact JSON document per line.
#
# Exits non-zero if any stage of the pipeline fails (pipefail).

set -euo pipefail

# "${1:-}" keeps `set -u` from aborting with a bare "unbound variable" error,
# so a missing argument produces the usage message instead.
log_gz_file="${1:-}"

if [ -z "${log_gz_file}" ]; then
  echo "Usage: $0 <gzipped log file>" >&2
  exit 1
fi

# Strip the shortest trailing ".<something>" to get the shared base name.
common_filename="${log_gz_file%.*}"
target_jsonl_gz_file="${common_filename}.jsonl.gz"

echo "converting ${log_gz_file} to ${target_jsonl_gz_file}"

# All path expansions are quoted so that paths containing whitespace or glob
# characters are handled as a single file. The input is decompressed from
# stdin, which avoids any filename-suffix handling by the decompressor.
gzip -dc < "${log_gz_file}" \
  | sed -e 's/[^ ]* //' \
  | jq --compact-output 'del(.kubernetes) | del(.docker)' \
  | gzip \
  > "${target_jsonl_gz_file}"
20 changes: 20 additions & 0 deletions scripts/convert-gzipped-cloudwatch-logs-to-jsonl-gz.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

# Converts every gzipped log file below a directory to ".jsonl.gz" and then
# concatenates all ".jsonl.gz" files found there into one gzipped JSONL file.
#
# Usage: convert-gzipped-cloudwatch-logs-to-jsonl-gz.sh <local cloudwatch dir> <target jsonl gz file>
#
#   <local cloudwatch dir>   directory that is searched recursively for files
#   <target jsonl gz file>   combined output file; overwritten if it exists
#
# The per-file conversion is delegated to
# ./scripts/convert-gzipped-cloudwatch-logs-to-bigquery-jsonl-gz.sh, referenced
# relative to the current working directory, so this script has to be started
# from the directory that contains "scripts/".
#
# Exits non-zero if an argument is missing, the directory does not exist, a
# conversion fails, or there is nothing to combine.

set -euo pipefail

# "${N:-}" defers the missing-argument case to the usage check below; with a
# plain "$1" `set -u` would abort before the usage message could be printed.
local_cloudwatch_dir="${1:-}"
target_jsonl_gz_file="${2:-}"

if [ -z "${local_cloudwatch_dir}" ] || [ -z "${target_jsonl_gz_file}" ]; then
  echo "Usage: $0 <local cloudwatch dir> <target jsonl gz file>" >&2
  exit 1
fi

# The file lists below come from process substitutions, whose exit status is
# not checked, so the most likely `find` failure is ruled out up front.
if [ ! -d "${local_cloudwatch_dir}" ]; then
  echo "error: not a directory: ${local_cloudwatch_dir}" >&2
  exit 1
fi

echo "converting ${local_cloudwatch_dir} to ${target_jsonl_gz_file}"

# Convert every file whose *name* does not contain "jsonl" (i.e. skip output
# of this or earlier runs). Matching on the file name rather than the whole
# path keeps a directory called e.g. "jsonl-export" from hiding all logs.
# NUL-delimited names keep paths with whitespace intact. A failing conversion
# aborts the script via `set -e`.
while IFS= read -r -d '' log_gz_file; do
  # stdin is detached so the converter can never consume the file list.
  ./scripts/convert-gzipped-cloudwatch-logs-to-bigquery-jsonl-gz.sh "${log_gz_file}" < /dev/null
done < <(find "${local_cloudwatch_dir}" -type f ! -name '*jsonl*' -print0)

echo "combining jsonl files to ${target_jsonl_gz_file}"

# Collect the converted files first so that "nothing to combine" is reported
# explicitly instead of surfacing as an unexplained pipeline failure.
jsonl_gz_files=()
while IFS= read -r -d '' jsonl_gz_file; do
  jsonl_gz_files+=("${jsonl_gz_file}")
done < <(find "${local_cloudwatch_dir}" -type f -name '*.jsonl.gz' -print0)

if [ "${#jsonl_gz_files[@]}" -eq 0 ]; then
  echo "error: no .jsonl.gz files found in ${local_cloudwatch_dir}" >&2
  exit 1
fi

# printf is a shell builtin, so the list is not subject to the argument-length
# limit; xargs splits it into as many gunzip invocations as needed.
printf '%s\0' "${jsonl_gz_files[@]}" \
  | xargs -0 gunzip -c \
  | gzip \
  > "${target_jsonl_gz_file}"
1 change: 0 additions & 1 deletion scripts/download-from-cloudwatch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,3 @@ echo "downloading from ${logs_url} to ${target_dir}"

mkdir -p "${target_dir}"
aws s3 cp --recursive ${logs_url} "${target_dir}"
gunzip -r "${target_dir}"