-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_processing.py
More file actions
69 lines (61 loc) · 5.21 KB
/
data_processing.py
File metadata and controls
69 lines (61 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import pandas as pd
import os
# Saving all Data
# Every transformed CSV produced by this script is written into this directory,
# so make sure it exists up front. exist_ok=True makes the call a no-op when
# the directory is already there and avoids the race between a separate
# os.path.exists() check and the creation itself.
os.makedirs('origin_destination_analysis/transformed_data', exist_ok=True)
# ORIGIN / DESTINATION
def _transform_ridership(source_csv, target_csv, months=7):
    """Clean one summed-ridership export and save the result as CSV.

    The export is read with its first column as the index, all numeric
    columns are rounded to whole numbers, rows whose rounded
    'Estimated Average Ridership' is not positive are dropped, and the
    remaining ridership values are divided by ``months``.

    Args:
        source_csv: Path of the raw CSV export to read.
        target_csv: Path the transformed CSV is written to. Its parent
            directory is created when missing.
        months: Number of months the summed ridership covers; the
            ridership column is divided by this value. Defaults to 7.

    Returns:
        The transformed DataFrame (the same data that was written to
        ``target_csv``).
    """
    ridership = pd.read_csv(source_csv, index_col=0).round(0)
    # Rows that round down to zero riders carry no information; after the
    # filter, reset_index() turns the first CSV column back into a regular
    # column and gives the frame a fresh 0..n-1 index.
    has_riders = ridership['Estimated Average Ridership'] > 0
    ridership = ridership[has_riders].reset_index()
    # averaging the ridership over the covered months
    ridership['Estimated Average Ridership'] = ridership['Estimated Average Ridership'] / months
    target_dir = os.path.dirname(target_csv)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    ridership.to_csv(target_csv)
    return ridership


# ORIGIN
# Only transforming when the raw origin export has already been downloaded
# (this script never calls the API itself).
if os.path.isfile("origin_destination_analysis/initial_data/origin_data.csv"):
    # data was created using this url: https://data.ny.gov/Transportation/MTA-Subway-Origin-Destination-Ridership-Estimate-2/jsu2-fbtj/explore/query/SELECT%0A%20%20%60day_of_week%60%2C%0A%20%20%60hour_of_day%60%2C%0A%20%20%60origin_station_complex_id%60%2C%0A%20%20sum%28%60estimated_average_ridership%60%29%20AS%20%60sum_estimated_average_ridership%60%0AGROUP%20BY%20%60day_of_week%60%2C%20%60hour_of_day%60%2C%20%60origin_station_complex_id%60%0AHAVING%20%60sum_estimated_average_ridership%60%20%3E%3D%200.5/page/column_manager
    # ridership needs to be a sum because it's the avg ridership b/n specific pairs, not overall
    origin_df = _transform_ridership(
        "origin_destination_analysis/initial_data/origin_data.csv",
        "origin_destination_analysis/transformed_data/origin_data_transformed.csv",
    )
else:
    print("The origin data does not exist. Please download from the url to get the data.")

## DESTINATION
# Only transforming when the raw destination export has already been downloaded
# (this script never calls the API itself).
if os.path.isfile("origin_destination_analysis/initial_data/destination_data.csv"):
    # data was created using this url: https://data.ny.gov/Transportation/MTA-Subway-Origin-Destination-Ridership-Estimate-2/jsu2-fbtj/explore/query/SELECT%0A%20%20%60day_of_week%60%2C%0A%20%20%60hour_of_day%60%2C%0A%20%20%60destination_station_complex_id%60%2C%0A%20%20sum%28%60estimated_average_ridership%60%29%20AS%20%60sum_estimated_average_ridership%60%0AGROUP%20BY%20%60day_of_week%60%2C%20%60hour_of_day%60%2C%20%60destination_station_complex_id%60%0AHAVING%20%60sum_estimated_average_ridership%60%20%3E%3D%200.5/page/filter
    # ridership needs to be a sum because it's the avg ridership b/n specific pairs, not overall
    destination_df = _transform_ridership(
        "origin_destination_analysis/initial_data/destination_data.csv",
        "origin_destination_analysis/transformed_data/destination_data_transformed.csv",
    )
else:
    print("The destination data does not exist. Please download from the url to get the data.")
## STOP NAMES
# Without the raw station export there is nothing to transform, so just tell
# the user; otherwise load it, give the columns their final names and save it.
if not os.path.isfile("origin_destination_analysis/initial_data/station_names.csv"):
    print("The stop names data does not exist. Please download from the url to get the data.")
else:
    # data was created using this url: https://data.ny.gov/Transportation/MTA-Subway-Origin-Destination-Ridership-Estimate-2/jsu2-fbtj/explore/query/SELECT%0A%20%20%60origin_station_complex_id%60%2C%0A%20%20%60origin_station_complex_name%60%2C%0A%20%20%60origin_latitude%60%2C%0A%20%20%60origin_longitude%60%0AGROUP%20BY%0A%20%20%60origin_station_complex_id%60%2C%0A%20%20%60origin_station_complex_name%60%2C%0A%20%20%60origin_latitude%60%2C%0A%20%20%60origin_longitude%60/page/aggregate
    station_names_df = pd.read_csv(
        "origin_destination_analysis/initial_data/station_names.csv",
        index_col=0,
    )
    # Move the first CSV column back out of the index so that all four
    # columns can be renamed positionally below.
    station_names_df = station_names_df.reset_index()
    station_names_df.columns = [
        'Station Complex ID',
        'Station Complex Name',
        'Latitude',
        'Longitude',
    ]
    station_names_df.to_csv(
        "origin_destination_analysis/transformed_data/station_names_transformed.csv"
    )
# Unfinished attempt at splitting the data into half-hour intervals.
# Everything below is commented-out draft code and is not executed.
# # make new datetime
# # split difficult rows into 2
# destination_df_w_time_interval = pd.DataFrame(columns=['Day of Week', 'Hour of Day', 'Destination Station Complex ID',
# 'Estimated Average Ridership', 'Station Complex Name', 'time_interval'])
# for idx in range(len(destination_df[0:2])):
# row_time = destination_df['Hour of Day'][idx]
# if row_time in [6, 9, 15]:
# row_1 = destination_df['Hour of Day'][idx]
# row_1['Estimated Average Ridership'] = row_1['Estimated Average Ridership'] / 2
# row_2
# for row in [row_1, row_2]:
# interval_string = determine_train_time_intervals(row_time
# , destination_df['Day of Week'][idx])
# new_row = row
# new_row['time_interval'] = interval_string
# destination_df_w_time_interval = pd.concat([new_row.to_frame().T, destination_df_w_time_interval])
# else:
# interval_string = determine_train_time_intervals(row_time
# , destination_df['Day of Week'][idx])
# new_row = destination_df.iloc[0]
# new_row['time_interval'] = interval_string
# destination_df_w_time_interval = pd.concat([new_row.to_frame().T, destination_df_w_time_interval])
# # assign interval string to list and add to dataframe