-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgatimeBetweenDates.R
More file actions
273 lines (220 loc) · 9.92 KB
/
gatimeBetweenDates.R
File metadata and controls
273 lines (220 loc) · 9.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# Fredrik Cederlöf, Head of CX & Analytics Collector Bank
# https://www.linkedin.com/in/fredrikcederlof/
# Blog post https://medium.com/@fredrikcederlof/analyzing-time-between-bookings-and-events-with-r-google-analytics-293f6c334d73
#install(ggplot2)
#install(googleAnalyticsR)
#install(dplyr)
#install(reshape2)
library(googleAnalyticsR)
library(ggplot2)
library(dplyr)
library(reshape2)
# Load Google Analytics Authorization + View ID
view_id <- xxxxxxxxx
ga_auth()
# Create your data frame based on given conditions
gadata <- google_analytics_4(view_id,
date_range = c(Sys.Date() -365, Sys.Date() - 1),
metrics = "users",
dimensions = c("date","transactionId","ga:dimension1","ChannelGrouping"),
filters = c("ga:dimension2==b2c"),
anti_sample = TRUE)
# Convert dimension booking date (Char) to Date
gadata$BookingDate <- as.Date(gadata$dimension1)
# Add a new column for days between booking and the event
gadata$Days <- gadata$BookingDate - gadata$date
# Aggregate days to weeks
gadata$Days <-gadata$Days / 7
# Round digits towards zero
gadata$Days <- round(gadata$Days,digits=0)
# Convert the column from datatype difftime to numeric
gadata$Days <- as.numeric(gadata$Days)
# Rename df column Days to Weeks
rename(gadata, Days = Weeks)
# Instead of dplyr use following code
# names(gadata)[names(gadata) == 'Days'] <- 'Weeks'
# Plot number of weeks between booking and the event with ggplot2
ggplot(data=gadata, aes(gadata$Weeks)) +
geom_histogram(breaks=seq(0, 30, by = 1),
col="#00FA92",
fill="#00C853",
alpha = 0.9) +
labs(title="Weeks between online booking and the event", size=26)+
labs(x="# of Weeks", y="# of Bookings") +
theme_bw() +
theme(panel.border = element_blank(), panel.background = element_blank())+
xlim(0,30)
ylim(0,0)
# Remove Channel Group containing source = (Other)
gadataChannels = gadata[!gadata$C == "(Other)", ]
gadataChannels = data.frame(gadataChannels)
# Visualize the plot with a density line
#geom_histogram(aes(y=..density..),breaks=seq(0, 30, by = 1),
# Replaced it with a new Y-variable and add geom_density
geom_histogram(aes(y=..density..),breaks=seq(0, 30, by = 1),
geom_density(alpha=.3, col="#4294F7", fill="#4294F7")+
# Create a new df (ChannelDF) based on Channel and Booking Days
BookingChannel = gadata$ChannelGrouping
BookingDays = gadata$BookingDate - gadata$date
ChannelDF = data.frame(BookingChannel, BookingDays)
# Convert difftime to num and double to char
ChannelDF$BookingDays <- as.numeric(ChannelDF$BookingDays)
ChannelDF$BookingChannel <- as.character(ChannelDF$BookingChannel)
# Create new df's from each marketing channel
Organic <- ChannelDF[BookingChannel == "Organic Search", ]
Mail <- ChannelDF[BookingChannel == "Email", ]
Direct <- ChannelDF[BookingChannel == "Direct", ]
Referral <- ChannelDF[BookingChannel == "Referral", ]
PaidSearch <- ChannelDF[BookingChannel == "Paid Search", ]
Social <- ChannelDF[BookingChannel == "Social", ]
# Sum all rows in Organic and create the variable SumOrganic
SumOrganic <- nrow(Organic)
# Convert Days to Weeks and round to 1 digit
Organic <- round(Organic$BookingDays/7,digits=1)
# Extract only 0-1 weeks from the variable Organic
OneWeekOrganic <- length(subset(Organic, Organic < 2))
# Calculating share of total
OrganicShare <- OneWeekOrganic/SumOrganic*100
# Create two new vectors with values for each channel
PercentBTE <- c(OrganicShare, MailShare, DirectShare, PaidSearchShare, ReferralShare, SocialShare)
ChannelGroup <-c("Organic", "Mail", "Direct", "Paid Search", "Referral", "Social Share")
# Round percentage variables and concatenate variables to a new df
PercentBTE <- round(PercentBTE,digits=2)
# Construct a df based on the two vectors above
DFCG = data.frame(PercentBTE, ChannelGroup)
# Plot data showing BTE for each channel
ggplot(DFCG, aes(x=ChannelGroup, y=PercentBTE)) +
geom_text(aes(label=PercentBTE), vjust=0) +
geom_bar(stat="identity", fill = "#80c3e5") +
scale_y_continuous(limits = c(0,100)) +
theme_bw() +
theme(panel.border = element_blank(), panel.background = element_blank())+
labs(title="Share of BTE within a 7 days period", size=30) +
labs(x="Channel Groups", y="% BTE within 7 days")
filters = c("ga:deviceCategory==desktop;ga:dimension5==b2c")
# Desktop
DFCG_Desktop = data.frame(PercentBTE, ChannelGroup)
names(DFCG_Desktop)[names(DFCG_Desktop) == 'PercentBTE'] <- 'Desktop'
# Mobile
DFCG_Mobile = data.frame(PercentBTE, ChannelGroup)
names(DFCG_Mobile)[names(DFCG_Mobile) == 'PercentBTE'] <- 'Mobile'
# Tablet
DFCG_Tablet = data.frame(PercentBTE, ChannelGroup)
names(DFCG_Tablet)[names(DFCG_Tablet) == 'PercentBTE'] <- 'Tablet'
# Merge all of the df above to one (DFCG_Device)
DFCG_Device = data.frame(DFCG_Desktop$ChannelGroup, DFCG_Desktop$Desktop, DFCG_Mobile$Mobile, DFCG_Tablet$Tablet)
# Rename df columns
names(DFCG_Device)[names(DFCG_Device) == 'DFCG_Desktop.ChannelGroup'] <- 'ChannelGroup'
names(DFCG_Device)[names(DFCG_Device) == 'DFCG_Desktop.Desktop'] <- 'Desktop'
names(DFCG_Device)[names(DFCG_Device) == 'DFCG_Mobile.Mobile'] <- 'Mobile'
names(DFCG_Device)[names(DFCG_Device) == 'DFCG_Tablet.Tablet'] <- 'Tablet'
# View df
View(DFCG_Device)
DFCG_Device.long<-melt(DFCG_Device)
# Plot data showing BTE for each channel per device category
ggplot(DFCG_Device.long,aes(ChannelGroup,value,fill=variable))+
geom_bar(stat="identity", position = position_dodge(width = 0.6), width=0.5)+
scale_fill_manual("legend", values = c("Desktop" = "#13A1FC", "Mobile" = "#052940", "Tablet" = "#0B507F"))+
geom_text(aes(label = paste(value,"%")),
position = position_dodge(width = 0.5), vjust = -0.4, size=3.5) +
theme_bw() +
theme(panel.border = element_blank(), panel.background = element_blank())+
theme(axis.text = element_text(size = 11))+
theme(axis.title = element_text(size = 13))+
labs(title="Share of BTE per Marketing Channel", size=30) +
labs(x="Channel Groups", y="Share% BTE within 7 days")+
guides(fill=guide_legend(title="Device Category"))
# Create your data frame based on given conditions
df_paidsearch <- google_analytics_4(view_id,
date_range = c("2017-01-01", "2017-12-31"),
metrics = c("transactions"),
dimensions = c("dayOfWeek", "hour", "userGender", "userAgeBracket"),
filters = c("ga:channelGrouping==Paid Search;ga:dimension5==B2C"),
anti_sample = TRUE)
# Plot the df values wit ggplot heatmapdf_paidsearch %>%
ggplot(aes(x=dayOfWeek, userGender, y=hour, fill=transactions)) +
geom_tile( colour = "black")+
geom_text(aes(label = transactions),size=3, lineheight=4, colour="black") +
facet_grid(userGender~userAgeBracket) +
# Re-order and rename X-scale
scale_x_discrete(limits=c(
"1","2","3","4","5","6","0"),
labels=c(
"1" = "Monday",
"2" = "Thuesday",
"3" = "Wednesday",
"4" = "Thursday",
"5" = "Friday",
"6" = "Saturday",
"0" = "Sunday"))+
scale_y_discrete(labels=c(
"00" = "00:00",
"01" = "01:00",
"02" = "02:00",
"03" = "03:00",
"04" = "04:00",
"05" = "05:00",
"06" = "06:00",
"07" = "07:00",
"08" = "08:00",
"09" = "09:00",
"10" = "10:00",
"11" = "11:00",
"12" = "12:00",
"13" = "13:00",
"14" = "14:00",
"15" = "15:00",
"16" = "16:00",
"17" = "17:00",
"18" = "18:00",
"19" = "19:00",
"20" = "20:00",
"21" = "21:00",
"22" = "22:00",
"23" = "23:00"))
# Style color of the heatmap
scale_fill_gradient(low = '#FFFFA6', high = 'red', name="Bookings",limits= c(0,7600)) +
# Declare description texts for title, Y and X scale
labs(title = "BTE within 7 days from Paid Search split by age and gender", x="Weekday", y="Hour of day") +
# Change the appearance and the orientation angle
theme(
axis.text.x = element_text(face="plain", color="black", size=9, angle=90),
axis.text.y = element_text(face="plain", color="black", size=9, angle=0),
axis.line = element_line(colour = "black", size = 0, linetype = "solid"),
strip.text.x = element_text(size=11, angle=0, face="bold"),
strip.text.y = element_text(size=11, face="bold"),
strip.background = element_rect(colour="white", fill="#FFFFFF"),
panel.background = element_blank())
# Determine the advertising cost to gain x revenue
df_ppc_budget <- google_analytics_4(view_id,
date_range = c("2017-08-01", "2018-02-22"),
metrics = c("adCost","transactionRevenue"),
dimensions = c("date"),
filters = c("ga:medium==cpc;ga:dimension5==b2c"),
anti_sample = TRUE)
# Check variables format
str(df_ppc_budget)
# Remove outliers on row 116 - (in this case, black friday)
df_ppc_budget <- df_ppc_budget[-c(116), ]
#Convert date from char to date, as.Date
df_ppc_budget$date <- as.Date(df_ppc_budget$date)
# Check correlation between Cost and Revenue
round(cor(df_paidsearch_budget$adCost, df_paidsearch_budget$transactionRevenue) ,2)
cor.test(df_paidsearch_budget$adCost, df_paidsearch_budget$transactionRevenue)
# Melt adCost with transactionRevenue to one single column
m.df <- melt(df_ppc_budget, id="date")
#Plot the result with a smooth line
ggplot(m.df, aes(x = date, y = value, colour = variable)) +
geom_smooth(size=2)+
theme_minimal() +
scale_x_date(date_breaks = "1 month",
limits = as.Date(c('2017-08-01','2018-02-22')))+
labs(title = "Correlation study between Cost & Revenue)", x="Date", y="Cost & Revenue") +
scale_colour_manual(values=c("#FF7E79", "#97B86C")
# Build linear regression model. Response goes on the left side of the ~, and the predictor(s) on the right
linearMod <- lm(adCost~transactionRevenue,data=df_paidsearch_budget)
sum=summary(linearMod)
#Print the sum
sum
# Call for predicted values for AdCost based on transaction revenue
predict(linearMod, data.frame(transactionRevenue = c(20000, 15000, 10000)))