From cb2e5924ab40c431fe93afb7071fafded0ad0de4 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 16 Mar 2021 22:08:58 -0400 Subject: [PATCH 1/5] added aggregate design --- src/covidify/data_prep.py | 301 ++++++++++++++++++++------------------ src/covidify/forecast.py | 109 +++++++------- 2 files changed, 219 insertions(+), 191 deletions(-) diff --git a/src/covidify/data_prep.py b/src/covidify/data_prep.py index 6153d44..9b1ecd7 100644 --- a/src/covidify/data_prep.py +++ b/src/covidify/data_prep.py @@ -51,180 +51,199 @@ ############ COUNTRY SELECTION ############ - -def get_similar_countries(c, country_list): - pos_countries = get_close_matches(c, country_list) - - if len(pos_countries) > 0: - print('\033[1;31m'+c, 'was not listed. did you mean', pos_countries[0].capitalize() + '?\033[0;0m') +class CountrySelection: + #Aggregate Root + def __init__ (c, country_list): + self.c = c + self.country_list = country_list + + def get_similar_countries(c, country_list): + pos_countries = get_close_matches(c, country_list) - #Only delete if its a covidify generated folder - if 'Desktop/covidify-output-' in out: - os.system('rm -rf ' + out) - sys.exit(1) - else: - print('\033[1;31m'+c, 'was not listed.\033[0;0m') - if 'Desktop/covidify-output-' in out: - os.system('rm -rf ' + out) - sys.exit(1) - -def check_specified_country(df, country): - ''' - let user filter reports by country, if not found - then give a option if the string is similar - ''' - - # Get all unique countries in the data - country_list = list(map(lambda x:x.lower().strip(), set(df.country.values))) - - if country: - print('Country specified!') - if country.lower() == 'Mainland China': #Mainland china and china doesn't come up as similar - print(country, 'was not listed. did you mean China?') - sys.exit(1) - # give similar option if similarity found - if country.lower() not in country_list: - get_similar_countries(country, country_list) + if len(pos_countries) > 0: + print('\033[1;31m'+c, 'was not listed. did you mean', pos_countries[0].capitalize() + '?\033[0;0m') + #Only delete if its a covidify generated folder + if 'Desktop/covidify-output-' in out: + os.system('rm -rf ' + out) + sys.exit(1) else: - #Return filtered dataframe - print('... filtering data for', country) - if len(country) == 2: - df = df[df.country == country.upper()] + print('\033[1;31m'+c, 'was not listed.\033[0;0m') + if 'Desktop/covidify-output-' in out: + os.system('rm -rf ' + out) + sys.exit(1) + + def check_specified_country(df, country): + ''' + let user filter reports by country, if not found + then give a option if the string is similar + ''' + + # Get all unique countries in the data + country_list = list(map(lambda x:x.lower().strip(), set(df.country.values))) + + if country: + print('Country specified!') + if country.lower() == 'Mainland China': #Mainland china and china doesn't come up as similar + print(country, 'was not listed. did you mean China?') + sys.exit(1) + # give similar option if similarity found + if country.lower() not in country_list: + get_similar_countries(country, country_list) + else: - df = df[df.country == capwords(country)] + #Return filtered dataframe + print('... filtering data for', country) + if len(country) == 2: + df = df[df.country == country.upper()] + else: + df = df[df.country == capwords(country)] + return df + else: + print('... No specific country specified') return df - else: - print('... No specific country specified') - return df -df = check_specified_country(df, country) + df = check_specified_country(df, country) ############ DAILY CASES ############ +class DailyCases: + #Aggregate Root + def __init__ (tmp, col): + self.tmp = tmp + self.col = col -# sheets need to be sorted by date value -# print('Sorting by datetime...') -df = df.sort_values('datetime') + # sheets need to be sorted by date value + # print('Sorting by datetime...') + df = df.sort_values('datetime') -current_date = str(datetime.date(datetime.now())) + current_date = str(datetime.date(datetime.now())) -''' -Get the difference of the sum totals for each -date and plot them on a trendline graph -''' -def get_new_cases(tmp, col): - diff_list = [] - tmp_df_list = [] - df = tmp.copy() + ''' + Get the difference of the sum totals for each + date and plot them on a trendline graph + ''' + def get_new_cases(tmp, col): + diff_list = [] + tmp_df_list = [] + df = tmp.copy() - for i, day in enumerate(df.sort_values('file_date').file_date.unique()): - tmp_df = df[df.file_date == day] - tmp_df_list.append(tmp_df[col].sum()) + for i, day in enumerate(df.sort_values('file_date').file_date.unique()): + tmp_df = df[df.file_date == day] + tmp_df_list.append(tmp_df[col].sum()) - if i == 0: - diff_list.append(tmp_df[col].sum()) - else: - diff_list.append(tmp_df[col].sum() - tmp_df_list[i-1]) + if i == 0: + diff_list.append(tmp_df[col].sum()) + else: + diff_list.append(tmp_df[col].sum() - tmp_df_list[i-1]) - return diff_list + return diff_list -def get_moving_average(tmp, col): - df = tmp.copy() - return df[col].rolling(window=2).mean() + def get_moving_average(tmp, col): + df = tmp.copy() + return df[col].rolling(window=2).mean() -def get_exp_moving_average(tmp, col): - df = tmp.copy() - return df[col].ewm(span=2, adjust=True).mean() + def get_exp_moving_average(tmp, col): + df = tmp.copy() + return df[col].ewm(span=2, adjust=True).mean() -print('... Calculating dataframe for new cases') -daily_cases_df = pd.DataFrame([]) -daily_cases_df['date'] = df.file_date.unique() -daily_cases_df = daily_cases_df.sort_values('date') -daily_cases_df['new_confirmed_cases'] = get_new_cases(df, 'confirmed') -daily_cases_df['new_deaths'] = get_new_cases(df, 'deaths') -daily_cases_df['new_recoveries'] = get_new_cases(df, 'recovered') -daily_cases_df['cumulative_cases'] = daily_cases_df.new_confirmed_cases.cumsum() -daily_cases_df.insert(loc=0, column='day', value=np.arange(0, len(daily_cases_df))) + print('... Calculating dataframe for new cases') + daily_cases_df = pd.DataFrame([]) + daily_cases_df['date'] = df.file_date.unique() + daily_cases_df = daily_cases_df.sort_values('date') + daily_cases_df['new_confirmed_cases'] = get_new_cases(df, 'confirmed') + daily_cases_df['new_deaths'] = get_new_cases(df, 'deaths') + daily_cases_df['new_recoveries'] = get_new_cases(df, 'recovered') + daily_cases_df['cumulative_cases'] = daily_cases_df.new_confirmed_cases.cumsum() + daily_cases_df.insert(loc=0, column='day', value=np.arange(0, len(daily_cases_df))) -''' -Calculate the number of people that are ACTUALLY infected on a given day -currently infected = sum of people date - (recovored + died) -ex: 5 = 10 - (4 - 1) + ''' + Calculate the number of people that are ACTUALLY infected on a given day + currently infected = sum of people date - (recovored + died) + ex: 5 = 10 - (4 - 1) -''' -current_infected = pd.DataFrame([]) -current_infected['currently_infected'] = (df.groupby('file_date').confirmed.sum() - (df.groupby('file_date').deaths.sum() + df.groupby('file_date').recovered.sum())) -current_infected['delta'] = (current_infected['currently_infected'] - df.groupby('file_date').confirmed.sum()) -current_infected.index.rename('date', inplace=True) + ''' + current_infected = pd.DataFrame([]) + current_infected['currently_infected'] = (df.groupby('file_date').confirmed.sum() - (df.groupby('file_date').deaths.sum() + df.groupby('file_date').recovered.sum())) + current_infected['delta'] = (current_infected['currently_infected'] - df.groupby('file_date').confirmed.sum()) + current_infected.index.rename('date', inplace=True) -daily_cases_df = pd.merge(daily_cases_df, current_infected, how='outer', on='date') + daily_cases_df = pd.merge(daily_cases_df, current_infected, how='outer', on='date') ############ LOG DATA ############ +class LogData: + #Aggregate Root + def __init__ (data, country): + self.data = data + self.country = country + + print('Calculating data for logarithmic plotting...') + if not country: + print('... top infected countries: {}'.format(top)) + + def get_top_countries(data): + # Get top N infected countries + tmp_df = data.copy() + tmp_df = tmp_df[tmp_df.file_date == df.file_date.max()] + return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index + + TOP_N_COUNTRIES = get_top_countries(df) + + tmp_df = df[df.country.isin(TOP_N_COUNTRIES)].copy() + + def get_day_counts(d, country): + ''' + For each country, get the days of the spread since 500 + cases + ''' + data = d.copy() + result_df = pd.DataFrame([]) + result_df = data.groupby(['file_date']).agg({'confirmed': 'sum', + 'recovered': 'sum', + 'deaths': 'sum'}) + result_df['date'] = data['file_date'].unique() + result_df['country'] = country + + result_df = result_df[result_df.confirmed >= 500] + result_df.insert(loc=0, column='day', value=np.arange(len(result_df))) + return result_df -print('Calculating data for logarithmic plotting...') -if not country: - print('... top infected countries: {}'.format(top)) - -def get_top_countries(data): - # Get top N infected countries - tmp_df = data.copy() - tmp_df = tmp_df[tmp_df.file_date == df.file_date.max()] - return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index - -TOP_N_COUNTRIES = get_top_countries(df) - -tmp_df = df[df.country.isin(TOP_N_COUNTRIES)].copy() + df_list = [] -def get_day_counts(d, country): - ''' - For each country, get the days of the spread since 500 - cases - ''' - data = d.copy() - result_df = pd.DataFrame([]) - result_df = data.groupby(['file_date']).agg({'confirmed': 'sum', - 'recovered': 'sum', - 'deaths': 'sum'}) - result_df['date'] = data['file_date'].unique() - result_df['country'] = country + for country in TOP_N_COUNTRIES: + print(' ...', country + ': ' + str(tmp_df[(tmp_df.file_date == df.file_date.max()) & + (tmp_df.country == country)].confirmed.sum())) + df_list.append(get_day_counts(tmp_df[tmp_df.country == country], country)) - result_df = result_df[result_df.confirmed >= 500] - result_df.insert(loc=0, column='day', value=np.arange(len(result_df))) - return result_df - -df_list = [] - -for country in TOP_N_COUNTRIES: - print(' ...', country + ': ' + str(tmp_df[(tmp_df.file_date == df.file_date.max()) & - (tmp_df.country == country)].confirmed.sum())) - df_list.append(get_day_counts(tmp_df[tmp_df.country == country], country)) - -log_df = pd.concat(df_list, axis=0, ignore_index=True) + log_df = pd.concat(df_list, axis=0, ignore_index=True) ############ SAVE DATA ############ #Create date of extraction folder -data_folder = os.path.join('data', str(datetime.date(datetime.now()))) -save_dir = os.path.join(out, data_folder) +class SaveData: + #aggregate root + def __init__ (): + + data_folder = os.path.join('data', str(datetime.date(datetime.now()))) + save_dir = os.path.join(out, data_folder) -if not os.path.exists(save_dir): - os.system('mkdir -p ' + save_dir) + if not os.path.exists(save_dir): + os.system('mkdir -p ' + save_dir) -print('Creating subdirectory for data...') -print('...', save_dir) + print('Creating subdirectory for data...') + print('...', save_dir) -print('Saving...') -csv_file_name = 'agg_data_{}.csv'.format(datetime.date(datetime.now())) -df.astype(str).to_csv(os.path.join(save_dir, csv_file_name)) -print('...', csv_file_name) + print('Saving...') + csv_file_name = 'agg_data_{}.csv'.format(datetime.date(datetime.now())) + df.astype(str).to_csv(os.path.join(save_dir, csv_file_name)) + print('...', csv_file_name) -daily_cases_file_name = 'trend_{}.csv'.format(datetime.date(datetime.now())) -daily_cases_df.astype(str).to_csv(os.path.join(save_dir, daily_cases_file_name)) -print('...', daily_cases_file_name) + daily_cases_file_name = 'trend_{}.csv'.format(datetime.date(datetime.now())) + daily_cases_df.astype(str).to_csv(os.path.join(save_dir, daily_cases_file_name)) + print('...', daily_cases_file_name) -log_file_name = 'log_{}.csv'.format(datetime.date(datetime.now())) -log_df.astype(str).to_csv(os.path.join(save_dir, log_file_name)) -print('...', log_file_name) + log_file_name = 'log_{}.csv'.format(datetime.date(datetime.now())) + log_df.astype(str).to_csv(os.path.join(save_dir, log_file_name)) + print('...', log_file_name) -print('Done!') \ No newline at end of file + print('Done!') \ No newline at end of file diff --git a/src/covidify/forecast.py b/src/covidify/forecast.py index e96ad00..7e0da88 100644 --- a/src/covidify/forecast.py +++ b/src/covidify/forecast.py @@ -60,57 +60,66 @@ print('Creating reports folder...') os.system('mkdir -p ' + image_dir) - -def plot_forecast(tmp_df, train, index_forecast, forecast, confint): - ''' - Plot the values of train and test, the predictions from ARIMA and the shadowing - for the confidence interval. - - ''' - - # For shadowing - lower_series = pd.Series(confint[:, 0], index=index_forecast) - upper_series = pd.Series(confint[:, 1], index=index_forecast) - - print('... saving graph') - fig, ax = plt.subplots(figsize=FIG_SIZE) - plt.title('ARIMA - Prediction for cumalitive case counts {} days in the future'.format(days_in_future)) - plt.plot(tmp_df.cumulative_cases, label='Train',marker='o') - plt.plot(tmp_df.pred, label='Forecast', marker='o') - tmp_df.groupby('date')[['']].sum().plot(ax=ax) - plt.fill_between(index_forecast, - upper_series, - lower_series, - color='k', alpha=.1) - plt.ylabel('Infections') - plt.xlabel('Date') - fig.legend().set_visible(True) - fig = ax.get_figure() - fig.savefig(os.path.join(image_dir, 'cumulative_forecasts.png')) - - -def forecast(tmp_df, train, index_forecast, days_in_future): - - # Fit model with training data - model = auto_arima(train, trace=False, error_action='ignore', suppress_warnings=True) - model_fit = model.fit(train) +class Forecast: + #Aggregate root + __init__ (tmp_df, train, index_forecast, forecast, confint, days_in_future): + self.tmp_df = tmp_df + self.train = train + self.index_forecast = index_forecast + self.forecast = forecast + self.confint = confint + self.days_in_future = days_in_future + + def plot_forecast(tmp_df, train, index_forecast, forecast, confint): + ''' + Plot the values of train and test, the predictions from ARIMA and the shadowing + for the confidence interval. - forecast, confint = model_fit.predict(n_periods=len(index_forecast), return_conf_int=True) + ''' - forecast_df = pd.concat([tmp_df, pd.DataFrame(forecast, index = index_forecast, columns=['pred'])], axis=1, sort=False) - date_range = [d.strftime('%Y-%m-%d') for d in pd.date_range(train_start, forecast_end)] - forecast_df['date'] = pd.Series(date_range).astype(str) - forecast_df[''] = None # Dates get messed up, so need to use pandas plotting + # For shadowing + lower_series = pd.Series(confint[:, 0], index=index_forecast) + upper_series = pd.Series(confint[:, 1], index=index_forecast) + + print('... saving graph') + fig, ax = plt.subplots(figsize=FIG_SIZE) + plt.title('ARIMA - Prediction for cumalitive case counts {} days in the future'.format(days_in_future)) + plt.plot(tmp_df.cumulative_cases, label='Train',marker='o') + plt.plot(tmp_df.pred, label='Forecast', marker='o') + tmp_df.groupby('date')[['']].sum().plot(ax=ax) + plt.fill_between(index_forecast, + upper_series, + lower_series, + color='k', alpha=.1) + plt.ylabel('Infections') + plt.xlabel('Date') + fig.legend().set_visible(True) + fig = ax.get_figure() + fig.savefig(os.path.join(image_dir, 'cumulative_forecasts.png')) + + + def forecast(tmp_df, train, index_forecast, days_in_future): - # Save Model and file - print('... saving file:', forecast_file) - forecast_df.to_csv(os.path.join(data_dir, forecast_file)) + # Fit model with training data + model = auto_arima(train, trace=False, error_action='ignore', suppress_warnings=True) + model_fit = model.fit(train) + + forecast, confint = model_fit.predict(n_periods=len(index_forecast), return_conf_int=True) + + forecast_df = pd.concat([tmp_df, pd.DataFrame(forecast, index = index_forecast, columns=['pred'])], axis=1, sort=False) + date_range = [d.strftime('%Y-%m-%d') for d in pd.date_range(train_start, forecast_end)] + forecast_df['date'] = pd.Series(date_range).astype(str) + forecast_df[''] = None # Dates get messed up, so need to use pandas plotting + + # Save Model and file + print('... saving file:', forecast_file) + forecast_df.to_csv(os.path.join(data_dir, forecast_file)) + + plot_forecast(forecast_df, train, index_forecast, forecast, confint) - plot_forecast(forecast_df, train, index_forecast, forecast, confint) - -if __name__ == '__main__': - print('Training forecasting model...') - - train = trend_df[trend_df.date.isin(train_period)].cumulative_cases - index_forecast = [x for x in range(train.index[-1]+1, train.index[-1] + days_in_future+1)] - forecast(trend_df, train, index_forecast, days_in_future) + if __name__ == '__main__': + print('Training forecasting model...') + + train = trend_df[trend_df.date.isin(train_period)].cumulative_cases + index_forecast = [x for x in range(train.index[-1]+1, train.index[-1] + days_in_future+1)] + forecast(trend_df, train, index_forecast, days_in_future) From 746cb7831a12eec7a04ba49d119c3098fb5c71bc Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 16 Mar 2021 22:27:02 -0400 Subject: [PATCH 2/5] added aggregate design to forecast.py --- src/covidify/forecast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/covidify/forecast.py b/src/covidify/forecast.py index 7e0da88..60b66a5 100644 --- a/src/covidify/forecast.py +++ b/src/covidify/forecast.py @@ -61,7 +61,7 @@ os.system('mkdir -p ' + image_dir) class Forecast: - #Aggregate root + #Aggregate root for ref __init__ (tmp_df, train, index_forecast, forecast, confint, days_in_future): self.tmp_df = tmp_df self.train = train From 27a873dd4a74145f2df3ec369498492c48d82643 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 16 Mar 2021 22:40:35 -0400 Subject: [PATCH 3/5] please let me fork again --- src/covidify/forecast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/covidify/forecast.py b/src/covidify/forecast.py index 60b66a5..2d819e0 100644 --- a/src/covidify/forecast.py +++ b/src/covidify/forecast.py @@ -69,7 +69,7 @@ class Forecast: self.forecast = forecast self.confint = confint self.days_in_future = days_in_future - + #please let me make another pull request def plot_forecast(tmp_df, train, index_forecast, forecast, confint): ''' Plot the values of train and test, the predictions from ARIMA and the shadowing From 9308bf8445c8b7ecd75bfacdc18d25b4db090b05 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 16 Mar 2021 22:48:09 -0400 Subject: [PATCH 4/5] applied aggregate to forecast.py --- src/covidify/forecast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/covidify/forecast.py b/src/covidify/forecast.py index 2d819e0..d0f4164 100644 --- a/src/covidify/forecast.py +++ b/src/covidify/forecast.py @@ -69,7 +69,7 @@ class Forecast: self.forecast = forecast self.confint = confint self.days_in_future = days_in_future - #please let me make another pull request + #please let me make another pull request adfasdfadfs def plot_forecast(tmp_df, train, index_forecast, forecast, confint): ''' Plot the values of train and test, the predictions from ARIMA and the shadowing From cdb648924285b9149eef0e440844c9c98bde6264 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 14 Apr 2021 15:09:21 -0400 Subject: [PATCH 5/5] Implemented config.py as singleton --- src/covidify/config.py | 93 +++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/src/covidify/config.py b/src/covidify/config.py index 9a2f960..24dd0f9 100644 --- a/src/covidify/config.py +++ b/src/covidify/config.py @@ -1,44 +1,53 @@ import os -# -# CLI -# -SCRIPT = '/pipeline.sh' -LIST_SCRIPT = '/pipeline.sh' - - -# -# DATA PREP -# -REPO = 'https://github.com/CSSEGISandData/COVID-19.git' -TMP_FOLDER = '/tmp/corona/' -TMP_GIT = os.path.join(TMP_FOLDER, REPO.split('/')[-1].split('.')[0]) -DATA = os.path.join(TMP_GIT, 'csse_covid_19_data', 'csse_covid_19_daily_reports') -LOG_TOP_N_COUNTRIES = 10 - - -# -# FORECASTING -# -DAYS_IN_FUTURE = 10 # Amount of days you want to forecast into future -PERC_SPLIT = 0.95 # Train / test split for forecasting model. - -# -# DATA VISUALIZATION -# -FIG_SIZE = (14,10) - - -#Github cols -KEEP_COLS = ['country', - 'province', - 'confirmed', - 'deaths', - 'recovered', - 'date', - 'datetime', - 'file_date'] - -NUMERIC_COLS = ['confirmed', - 'deaths', - 'recovered'] \ No newline at end of file +class SingletonConfig(object): + _instance = {} + """docstring for SingletonConfig""" + def __call__(sngl, *args, **kwargs): + if sngl not in sngl._instance: + cls._instance[sngl] = super(SingletonConfig, sngl).__call__(*args, **kwargs) + return sngl._instance[sngl] + + # + # CLI + # + SCRIPT = '/pipeline.sh' + LIST_SCRIPT = '/pipeline.sh' + + + # + # DATA PREP + # + REPO = 'https://github.com/CSSEGISandData/COVID-19.git' + TMP_FOLDER = '/tmp/corona/' + TMP_GIT = os.path.join(TMP_FOLDER, REPO.split('/')[-1].split('.')[0]) + DATA = os.path.join(TMP_GIT, 'csse_covid_19_data', 'csse_covid_19_daily_reports') + LOG_TOP_N_COUNTRIES = 10 + + + # + # FORECASTING + # + DAYS_IN_FUTURE = 10 # Amount of days you want to forecast into future + PERC_SPLIT = 0.95 # Train / test split for forecasting model. + + # + # DATA VISUALIZATION + # + FIG_SIZE = (14,10) + + + #Github cols + KEEP_COLS = ['country', + 'province', + 'confirmed', + 'deaths', + 'recovered', + 'date', + 'datetime', + 'file_date'] + + NUMERIC_COLS = ['confirmed', + 'deaths', + 'recovered'] +