-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsync_data.sh
More file actions
executable file
·129 lines (108 loc) · 4.45 KB
/
sync_data.sh
File metadata and controls
executable file
·129 lines (108 loc) · 4.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/bin/bash
# sync_data.sh — fetch daily stats extracts from S3 and load them into Postgres.
#
# Usage:
#   ./sync_data.sh [date] [pipeline1,pipeline2,...]
#   ./sync_data.sh 2022-08-10                                  (date, all pipelines)
#   ./sync_data.sh 2022-08-10 user_daily_visits,subscriptions  (specific pipelines)
#
# Note : DATABASE_URL is automatically created on scalingo machines.
# For testing :
# DATABASE_URL=${DATABASE_URL:-'postgresql://stats:stats@localhost:5432/stats'}
# echo $DATABASE_URL
echo "Starting job. Should display 'Done' when done, if there were no errors."

# Only for local dev: pull env vars (e.g. DATABASE_URL) from a .env file.
if [ -f .env ]; then
  source .env
fi

extract_date=$1 # date format ie. 2022-08-10
if [ -z "$extract_date" ]; then
  # Use yesterday's date to be sure that the data has arrived from the
  # previous pipeline.
  # NOTE(review): 'date -d' is GNU-specific — fine on Linux/scalingo, but
  # not portable to BSD/macOS.
  extract_date=$(date -d '1 day ago' +'%Y-%m-%d')
fi
echo "Extract date : $extract_date"

# Comma-separated pipeline names; "*" means run everything.
selected_pipelines=$2
if [ -z "$selected_pipelines" ]; then
  selected_pipelines="*"
fi
echo "Selected pipeline : $selected_pipelines"

echo "Connecting to Database : $DATABASE_URL"
# Set up DB - create table and index. URL is quoted: connection strings may
# contain characters ('?', '&', spaces) that word-split when unquoted.
psql -d "$DATABASE_URL" -f scripts/tables.sql
# Fetches the user_daily_visits extract from S3 for $extract_date, loads it
# into Postgres, then runs the enrich / view / large-view / check scripts.
# Globals: extract_date (read), DATABASE_URL (read)
user_daily_visits_pipeline() {
  echo "======== STARTING user_daily_visits PIPELINE AT $(date) ========"
  echo "Fetch S3 user_daily_visits $extract_date"
  time ./fetch_from_s3.sh user_daily_visits "$extract_date"
  echo "Insert User Daily Visits"
  time psql -d "$DATABASE_URL" -f scripts/insert_user_daily_visits_data.sql
  time ./sync_enrich_user_daily_visits.sh
  time ./sync_views_user_daily_visits.sh
  time ./sync_large_views_user_daily_visits.sh
  time ./sync_check_data_user_daily_visits.sh
  echo "======== FINISHED user_daily_visits PIPELINE AT $(date) ========"
}
# Fetches the subscriptions_aggregate extract from S3 for $extract_date and
# loads it into Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
subscriptions_pipeline() {
  echo "======== STARTING subscriptions PIPELINE AT $(date) ========"
  # Include the date in the log line, consistent with the other pipelines.
  echo "Fetch S3 subscriptions_aggregate $extract_date"
  time ./fetch_from_s3.sh subscriptions_aggregate "$extract_date"
  echo "Insert Subscriptions"
  time psql -d "$DATABASE_URL" -f scripts/insert_subscriptions_data.sql
  echo "======== FINISHED subscriptions PIPELINE AT $(date) ========"
}
# Fetches the events_roomv9_aggregate extract from S3 for $extract_date and
# loads it into Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
events_pipeline() {
  echo "======== STARTING events PIPELINE AT $(date) ========"
  # Include the date in the log line, consistent with the other pipelines.
  echo "Fetch S3 events_roomv9_aggregate $extract_date"
  time ./fetch_from_s3.sh events_roomv9_aggregate "$extract_date"
  echo "Insert Events roomv9"
  time psql -d "$DATABASE_URL" -f scripts/insert_events_roomv9_data.sql
  echo "======== FINISHED events PIPELINE AT $(date) ========"
}
# Fetches the pushers extract from S3 for $extract_date and loads it into
# Postgres. (Currently deactivated by the driver at the bottom of the file.)
# Globals: extract_date (read), DATABASE_URL (read)
pushers_pipeline() {
  echo "======== STARTING pushers PIPELINE AT $(date) ========"
  echo "Fetch S3 pushers $extract_date"
  time ./fetch_from_s3.sh pushers "$extract_date"
  echo "Insert Pushers"
  time psql -d "$DATABASE_URL" -f scripts/insert_pushers_data.sql
  echo "======== FINISHED pushers PIPELINE AT $(date) ========"
}
# Fetches the account_data extract from S3 for $extract_date and loads it
# into Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
account_data_pipeline() {
  echo "======== STARTING account_data PIPELINE AT $(date) ========"
  echo "Fetch S3 account_data $extract_date"
  time ./fetch_from_s3.sh account_data "$extract_date"
  echo "Insert Account Data"
  time psql -d "$DATABASE_URL" -f scripts/insert_account_data_data.sql
  echo "======== FINISHED account_data PIPELINE AT $(date) ========"
}
# Fetches the sso extract from S3 for $extract_date and loads it into
# Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
sso_pipeline() {
  echo "======== STARTING sso PIPELINE AT $(date) ========"
  echo "Fetch S3 sso $extract_date"
  time ./fetch_from_s3.sh sso "$extract_date"
  echo "Insert SSO Data"
  time psql -d "$DATABASE_URL" -f scripts/insert_sso_data.sql
  echo "======== FINISHED sso PIPELINE AT $(date) ========"
}
# Fetches the crisp_conversation_segments extract from S3 for $extract_date
# and loads it into Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
crisp_pipeline() {
  echo "======== STARTING crisp PIPELINE AT $(date) ========"
  echo "Fetch S3 crisp_conversation_segments $extract_date"
  time ./fetch_from_s3.sh crisp_conversation_segments "$extract_date"
  echo "Insert crisp conversations segments"
  time psql -d "$DATABASE_URL" -f scripts/insert_crisp_conversation_segments.sql
  echo "======== FINISHED crisp PIPELINE AT $(date) ========"
}
# Runs "<name>_pipeline" when <name> is selected, otherwise logs a skip.
# Selection is "*" (everything) or a comma-separated list. Matching is now
# exact on comma-delimited entries: the original substring test
# (*"$pipeline"*) would also fire when the selection merely *contained* the
# name (e.g. selecting "events_v2" would wrongly run the "events" pipeline).
# Globals:   selected_pipelines (read)
# Arguments: $1 - pipeline name, without the "_pipeline" suffix
execute_pipeline() {
  local pipeline=$1
  if [[ "$selected_pipelines" == "*" || ",$selected_pipelines," == *",$pipeline,"* ]]; then
    echo "Executing $pipeline pipeline..."
    "${pipeline}_pipeline"
  else
    echo "Skipping $pipeline pipeline..."
  fi
}
# Ensure tables/indexes exist before running the pipelines. URL quoted so
# special characters in the connection string do not word-split.
# NOTE(review): this repeats the tables.sql run done earlier in the script —
# presumably tables.sql is idempotent; verify before removing either call.
psql -d "$DATABASE_URL" -f scripts/tables.sql

execute_pipeline "user_daily_visits"
execute_pipeline "subscriptions"
execute_pipeline "events"
#execute_pipeline "pushers" deactivate pushers because it takes 40Go and it is not really used
execute_pipeline "account_data"
execute_pipeline "sso"
execute_pipeline "crisp"
echo "======== JOB COMPLETED AT $(date) ========"