-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsync_data.sh
More file actions
executable file
·129 lines (108 loc) · 4.45 KB
/
sync_data.sh
File metadata and controls
executable file
·129 lines (108 loc) · 4.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/bin/bash
# sync_data.sh — fetch daily stats extracts from S3 and load them into Postgres.
#
# Usage:
#   ./sync_data.sh [date] [pipeline1,pipeline2,...]
#   ./sync_data.sh 2022-08-10                                  (date, all pipelines)
#   ./sync_data.sh 2022-08-10 user_daily_visits,subscriptions  (specific pipelines)
#
# Note : DATABASE_URL is automatically created on scalingo machines.
# For testing :
# DATABASE_URL=${DATABASE_URL:-'postgresql://stats:stats@localhost:5432/stats'}
# echo $DATABASE_URL
echo "Starting job. Should display 'Done' when done, if there were no errors."

# Only for local dev: pull env vars (e.g. DATABASE_URL) from a .env file.
if [ -f .env ]; then
  source .env
fi

extract_date=$1 # date format ie. 2022-08-10
if [ -z "$extract_date" ]; then
  # Use yesterday's date to be sure that the data has arrived from the
  # previous pipeline.
  # NOTE(review): 'date -d' is GNU-specific — fine on Linux/scalingo, but
  # not portable to BSD/macOS.
  extract_date=$(date -d '1 day ago' +'%Y-%m-%d')
fi
echo "Extract date : $extract_date"

# Comma-separated pipeline names; "*" means run everything.
selected_pipelines=$2
if [ -z "$selected_pipelines" ]; then
  selected_pipelines="*"
fi
echo "Selected pipeline : $selected_pipelines"

echo "Connecting to Database : $DATABASE_URL"
# Set up DB - create table and index. URL is quoted: connection strings may
# contain characters ('?', '&', spaces) that word-split when unquoted.
psql -d "$DATABASE_URL" -f scripts/tables.sql
# Fetches the user_daily_visits extract from S3 for $extract_date, loads it
# into Postgres, then runs the enrich / view / large-view / check scripts.
# Globals: extract_date (read), DATABASE_URL (read)
user_daily_visits_pipeline() {
  echo "======== STARTING user_daily_visits PIPELINE AT $(date) ========"
  echo "Fetch S3 user_daily_visits $extract_date"
  time ./fetch_from_s3.sh user_daily_visits "$extract_date"
  echo "Insert User Daily Visits"
  time psql -d "$DATABASE_URL" -f scripts/insert_user_daily_visits_data.sql
  time ./sync_enrich_user_daily_visits.sh
  time ./sync_views_user_daily_visits.sh
  time ./sync_large_views_user_daily_visits.sh
  time ./sync_check_data_user_daily_visits.sh
  echo "======== FINISHED user_daily_visits PIPELINE AT $(date) ========"
}
# Fetches the subscriptions_aggregate extract from S3 for $extract_date and
# loads it into Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
subscriptions_pipeline() {
  echo "======== STARTING subscriptions PIPELINE AT $(date) ========"
  # Include the date in the log line, consistent with the other pipelines.
  echo "Fetch S3 subscriptions_aggregate $extract_date"
  time ./fetch_from_s3.sh subscriptions_aggregate "$extract_date"
  echo "Insert Subscriptions"
  time psql -d "$DATABASE_URL" -f scripts/insert_subscriptions_data.sql
  echo "======== FINISHED subscriptions PIPELINE AT $(date) ========"
}
# Fetches the events_roomv9_aggregate extract from S3 for $extract_date and
# loads it into Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
events_pipeline() {
  echo "======== STARTING events PIPELINE AT $(date) ========"
  # Include the date in the log line, consistent with the other pipelines.
  echo "Fetch S3 events_roomv9_aggregate $extract_date"
  time ./fetch_from_s3.sh events_roomv9_aggregate "$extract_date"
  echo "Insert Events roomv9"
  time psql -d "$DATABASE_URL" -f scripts/insert_events_roomv9_data.sql
  echo "======== FINISHED events PIPELINE AT $(date) ========"
}
# Fetches the pushers extract from S3 for $extract_date and loads it into
# Postgres. (Currently deactivated by the driver at the bottom of the file.)
# Globals: extract_date (read), DATABASE_URL (read)
pushers_pipeline() {
  echo "======== STARTING pushers PIPELINE AT $(date) ========"
  echo "Fetch S3 pushers $extract_date"
  time ./fetch_from_s3.sh pushers "$extract_date"
  echo "Insert Pushers"
  time psql -d "$DATABASE_URL" -f scripts/insert_pushers_data.sql
  echo "======== FINISHED pushers PIPELINE AT $(date) ========"
}
# Fetches the account_data extract from S3 for $extract_date and loads it
# into Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
account_data_pipeline() {
  echo "======== STARTING account_data PIPELINE AT $(date) ========"
  echo "Fetch S3 account_data $extract_date"
  time ./fetch_from_s3.sh account_data "$extract_date"
  echo "Insert Account Data"
  time psql -d "$DATABASE_URL" -f scripts/insert_account_data_data.sql
  echo "======== FINISHED account_data PIPELINE AT $(date) ========"
}
# Fetches the sso extract from S3 for $extract_date and loads it into
# Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
sso_pipeline() {
  echo "======== STARTING sso PIPELINE AT $(date) ========"
  echo "Fetch S3 sso $extract_date"
  time ./fetch_from_s3.sh sso "$extract_date"
  echo "Insert SSO Data"
  time psql -d "$DATABASE_URL" -f scripts/insert_sso_data.sql
  echo "======== FINISHED sso PIPELINE AT $(date) ========"
}
# Fetches the crisp_conversation_segments extract from S3 for $extract_date
# and loads it into Postgres.
# Globals: extract_date (read), DATABASE_URL (read)
crisp_pipeline() {
  echo "======== STARTING crisp PIPELINE AT $(date) ========"
  echo "Fetch S3 crisp_conversation_segments $extract_date"
  time ./fetch_from_s3.sh crisp_conversation_segments "$extract_date"
  echo "Insert crisp conversations segments"
  time psql -d "$DATABASE_URL" -f scripts/insert_crisp_conversation_segments.sql
  echo "======== FINISHED crisp PIPELINE AT $(date) ========"
}
# Runs "<name>_pipeline" when <name> is selected, otherwise logs a skip.
# Selection is "*" (everything) or a comma-separated list. Matching is now
# exact on comma-delimited entries: the original substring test
# (*"$pipeline"*) would also fire when the selection merely *contained* the
# name (e.g. selecting "events_v2" would wrongly run the "events" pipeline).
# Globals:   selected_pipelines (read)
# Arguments: $1 - pipeline name, without the "_pipeline" suffix
execute_pipeline() {
  local pipeline=$1
  if [[ "$selected_pipelines" == "*" || ",$selected_pipelines," == *",$pipeline,"* ]]; then
    echo "Executing $pipeline pipeline..."
    "${pipeline}_pipeline"
  else
    echo "Skipping $pipeline pipeline..."
  fi
}
# Ensure tables/indexes exist before running the pipelines. URL quoted so
# special characters in the connection string do not word-split.
# NOTE(review): this repeats the tables.sql run done earlier in the script —
# presumably tables.sql is idempotent; verify before removing either call.
psql -d "$DATABASE_URL" -f scripts/tables.sql

execute_pipeline "user_daily_visits"
execute_pipeline "subscriptions"
execute_pipeline "events"
#execute_pipeline "pushers" deactivate pushers because it takes 40Go and it is not really used
execute_pipeline "account_data"
execute_pipeline "sso"
execute_pipeline "crisp"
echo "======== JOB COMPLETED AT $(date) ========"