-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfinal_project_solutions.sql
More file actions
206 lines (167 loc) · 8.52 KB
/
final_project_solutions.sql
File metadata and controls
206 lines (167 loc) · 8.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
-- Connect to database (MySQL)
USE maven_advanced_sql;
-- PART I: SCHOOL ANALYSIS
-- TASK 1: View the schools and school details tables
SELECT * FROM schools;
SELECT * FROM school_details;
-- TASK 2: In each decade, how many schools were there that produced players? [Numeric Functions]
SELECT FLOOR(yearID / 10) * 10 AS decade, COUNT(DISTINCT schoolID) AS num_schools
FROM schools
GROUP BY decade
ORDER BY decade;
-- TASK 3: What are the names of the top 5 schools that produced the most players? [Joins]
SELECT sd.name_full, COUNT(DISTINCT s.playerID) AS num_players
FROM schools s LEFT JOIN school_details sd
ON s.schoolID = sd.schoolID
GROUP BY s.schoolID
ORDER BY num_players DESC
LIMIT 5;
-- TASK 4: For each decade, what were the names of the top 3 schools that produced the most players? [Window Functions]
WITH ds AS (SELECT FLOOR(s.yearID / 10) * 10 AS decade, sd.name_full, COUNT(DISTINCT s.playerID) AS num_players
FROM schools s LEFT JOIN school_details sd
ON s.schoolID = sd.schoolID
GROUP BY decade, s.schoolID),
rn AS (SELECT decade, name_full, num_players,
ROW_NUMBER() OVER (PARTITION BY decade ORDER BY num_players DESC) AS row_num
/* ALTERNATIVE SOLUTION UPDATE: ROW_NUMBER will return exactly 3 schools for each decade. To account for ties,
use DENSE_RANK instead to return the top 3 player counts, which could potentially include more than 3 schools */
FROM ds)
SELECT decade, name_full, num_players
FROM rn
WHERE row_num <= 3
ORDER BY decade DESC, row_num;
-- PART II: SALARY ANALYSIS
-- TASK 1: View the salaries table
SELECT * FROM salaries;
-- TASK 2: Return the top 20% of teams in terms of average annual spending [Window Functions]
WITH ts AS (SELECT teamID, yearID, SUM(salary) AS total_spend
FROM salaries
GROUP BY teamID, yearID
ORDER BY teamID, yearID), -- ORDER BY in CTE is not needed and can be omitted
sp AS (SELECT teamID, AVG(total_spend) AS avg_spend,
NTILE(5) OVER (ORDER BY AVG(total_spend) DESC) AS spend_pct
FROM ts
GROUP BY teamID)
SELECT teamID, ROUND(avg_spend / 1000000, 1) AS avg_spend_millions
FROM sp
WHERE spend_pct = 1;
-- TASK 3: For each team, show the cumulative sum of spending over the years [Rolling Calculations]
WITH ts AS (SELECT teamID, yearID, SUM(salary) AS total_spend
FROM salaries
GROUP BY teamID, yearID
ORDER BY teamID, yearID) -- ORDER BY in CTE is not needed and can be omitted
SELECT teamID, yearID,
ROUND(SUM(total_spend) OVER (PARTITION BY teamID ORDER BY yearID) / 1000000, 1)
AS cumulative_sum_millions
FROM ts;
-- TASK 4: Return the first year that each team's cumulative spending surpassed 1 billion [Min / Max Value Filtering]
WITH ts AS (SELECT teamID, yearID, SUM(salary) AS total_spend
FROM salaries
GROUP BY teamID, yearID
ORDER BY teamID, yearID), -- ORDER BY in CTE is not needed and can be omitted
cs AS (SELECT teamID, yearID,
SUM(total_spend) OVER (PARTITION BY teamID ORDER BY yearID)
AS cumulative_sum
FROM ts),
rn AS (SELECT teamID, yearID, cumulative_sum,
ROW_NUMBER() OVER (PARTITION BY teamID ORDER BY cumulative_sum) AS rn
FROM cs
WHERE cumulative_sum > 1000000000)
SELECT teamID, yearID, ROUND(cumulative_sum / 1000000000, 2) AS cumulative_sum_billions
FROM rn
WHERE rn = 1;
-- PART III: PLAYER CAREER ANALYSIS
-- TASK 1: View the players table and find the number of players in the table
SELECT * FROM players;
SELECT COUNT(*) FROM players;
-- TASK 2: For each player, calculate their age at their first (debut) game, their last game,
-- and their career length (all in years). Sort from longest career to shortest career. [Datetime Functions]
SELECT nameGiven,
TIMESTAMPDIFF(YEAR, CAST(CONCAT(birthYear, '-', birthMonth, '-', birthDay) AS DATE), debut)
AS starting_age,
TIMESTAMPDIFF(YEAR, CAST(CONCAT(birthYear, '-', birthMonth, '-', birthDay) AS DATE), finalGame)
AS ending_age,
TIMESTAMPDIFF(YEAR, debut, finalGame) AS career_length
FROM players
ORDER BY career_length DESC;
/* Datetime functions vary widely by RDBMS:
- MySQL: TIMESTAMPDIFF(YEAR, CAST(CONCAT(birthYear, '-', birthMonth, '-', birthDay) AS DATE), debut)
- Oracle: MONTHS_BETWEEN(TO_DATE(birthYear || '-' || birthMonth || '-' || birthDay, 'YYYY-MM-DD'), debut) / 12
- PostgreSQL: DATE_PART('year', debut) - DATE_PART('year', TO_DATE(birthYear || '-' || birthMonth || '-' || birthDay, 'YYYY-MM-DD'))
- SQL Server: DATEDIFF(YEAR, CAST(birthYear + '-' + birthMonth + '-' + birthDay AS DATE), debut)
- SQLite: CAST((STRFTIME('%Y', debut) - STRFTIME('%Y', birthYear || '-' || birthMonth || '-' || birthDay)) AS INTEGER)
*/
-- TASK 3: What team did each player play on for their starting and ending years? [Joins]
SELECT p.nameGiven,
s.yearID AS starting_year, s.teamID AS starting_team,
e.yearID AS ending_year, e.teamID AS ending_team
FROM players p INNER JOIN salaries s
ON p.playerID = s.playerID
AND YEAR(p.debut) = s.yearID
INNER JOIN salaries e
ON p.playerID = e.playerID
AND YEAR(p.finalGame) = e.yearID;
-- TASK 4: How many players started and ended on the same team and also played for over a decade? [Basics]
SELECT p.nameGiven,
s.yearID AS starting_year, s.teamID AS starting_team,
e.yearID AS ending_year, e.teamID AS ending_team
FROM players p INNER JOIN salaries s
ON p.playerID = s.playerID
AND YEAR(p.debut) = s.yearID
INNER JOIN salaries e
ON p.playerID = e.playerID
AND YEAR(p.finalGame) = e.yearID
WHERE s.teamID = e.teamID AND e.yearID - s.yearID > 10;
-- PART IV: PLAYER COMPARISON ANALYSIS
-- TASK 1: View the players table
SELECT * FROM players;
-- TASK 2: Which players have the same birthday? Hint: Look into GROUP_CONCAT / LISTAGG / STRING_AGG [String Functions]
WITH bn AS (SELECT CAST(CONCAT(birthYear, '-', birthMonth, '-', birthDay) AS DATE) AS birthdate,
nameGiven
FROM players)
SELECT birthdate, GROUP_CONCAT(nameGiven SEPARATOR ', ') AS players
FROM bn
WHERE YEAR(birthdate) BETWEEN 1980 AND 1990
GROUP BY birthdate
ORDER BY birthdate;
/* These functions vary by RDBMS:
String concatenation:
- MySQL & SQL Server: CONCAT(birthYear, '-', birthMonth, '-', birthDay)
- Oracle, PostgreSQL & SQLite: birthYear || '-' || birthMonth || '-' || birthDay
Group concatenation:
- MySQL: GROUP_CONCAT(nameGiven SEPARATOR ', ')
- Oracle: LISTAGG(nameGiven, ', ') WITHIN GROUP (ORDER BY nameGiven)
- PostgreSQL: STRING_AGG(nameGiven, ', ' ORDER BY nameGiven)
- SQL Server: STRING_AGG(nameGiven, ', ') WITHIN GROUP (ORDER BY nameGiven)
- SQLite: GROUP_CONCAT(nameGiven, ', ')
*/
-- TASK 3: Create a summary table that shows for each team, what percent of players bat right, left and both [Pivoting]
-- EDIT: This solution doesn't account for duplicate player rows in the salaries table. The DISTINCT solution below is the more accurate one.
SELECT s.teamID,
ROUND(SUM(CASE WHEN p.bats = 'R' THEN 1 ELSE 0 END) / COUNT(s.playerID) * 100, 1) AS bats_right,
ROUND(SUM(CASE WHEN p.bats = 'L' THEN 1 ELSE 0 END) / COUNT(s.playerID) * 100, 1) AS bats_left,
ROUND(SUM(CASE WHEN p.bats = 'B' THEN 1 ELSE 0 END) / COUNT(s.playerID) * 100, 1) AS bats_both
FROM salaries s LEFT JOIN players p
ON s.playerID = p.playerID
GROUP BY s.teamID;
-- EDIT: This is the more accurate solution, which first removes duplicate playerID-teamID combos from the salaries table before pivoting
-- The duplicates exist because each row in the salaries table represents a player's salary on a particular team for a particular year
WITH up AS (SELECT DISTINCT s.teamID, s.playerID, p.bats
FROM salaries s LEFT JOIN players p
ON s.playerID = p.playerID) -- unique players CTE
SELECT teamID,
ROUND(SUM(CASE WHEN bats = 'R' THEN 1 ELSE 0 END) / COUNT(playerID) * 100, 1) AS bats_right,
ROUND(SUM(CASE WHEN bats = 'L' THEN 1 ELSE 0 END) / COUNT(playerID) * 100, 1) AS bats_left,
ROUND(SUM(CASE WHEN bats = 'B' THEN 1 ELSE 0 END) / COUNT(playerID) * 100, 1) AS bats_both
FROM up
GROUP BY teamID;
-- TASK 4: How have average height and weight at debut game changed over the years, and what's the decade-over-decade difference? [Window Functions]
WITH hw AS (SELECT FLOOR(YEAR(debut) / 10) * 10 AS decade,
AVG(height) AS avg_height, AVG(weight) AS avg_weight
FROM players
GROUP BY decade)
SELECT decade,
avg_height - LAG(avg_height) OVER(ORDER BY decade) AS height_diff,
avg_weight - LAG(avg_weight) OVER(ORDER BY decade) AS weight_diff
FROM hw
WHERE decade IS NOT NULL;