Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 76 additions & 37 deletions src/covidify/data_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
import pandas as pd
from string import capwords
from difflib import get_close_matches
from datetime import datetime, date, time

from datetime import datetime, date, time
import abc
from covidify.sources import github, wiki
from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA
from covidify.utils.utils import replace_arg_score
Expand All @@ -43,49 +43,46 @@

# Load the raw data for the requested source. Both branches currently read
# from github.
# NOTE(review): no else branch is visible here - any other `source` value
# would leave `df` unassigned by this statement; confirm it is handled elsewhere.
if source == 'JHU':
df = github.get()

elif source == 'wiki':
# The wiki source is not implemented yet: tell the user and fall back to github.
print('Apologies, the wikipedia source is not ready yet - getting github data')
df = github.get()



############ COUNTRY SELECTION ############

def get_similar_countries(c, country_list):
    '''
    Report that country `c` is not in the data, clean up, and exit.

    Prints a red "was not listed" message, adding the closest match from
    `country_list` as a suggestion when difflib finds one. If the
    module-level `out` path looks like a covidify-generated output folder
    it is removed. The process then exits with status 1.

    NOTE(review): indentation was lost in the reviewed text; exiting on
    every path (not only after a folder delete) is assumed - confirm.

    Args:
        c: country name as supplied by the user.
        country_list: candidate country names to match against.

    Raises:
        SystemExit: always, with exit code 1.
    '''
    # Function-scope import so the block does not rely on a file-level one.
    import shutil

    pos_countries = get_close_matches(c, country_list)

    if pos_countries:
        print('\033[1;31m'+c, 'was not listed. did you mean', pos_countries[0].capitalize() + '?\033[0;0m')
    else:
        print('\033[1;31m'+c, 'was not listed.\033[0;0m')

    #Only delete if its a covidify generated folder
    if 'Desktop/covidify-output-' in out:
        # Remove the folder directly instead of building an `rm -rf` shell
        # string, which breaks on paths containing spaces or shell
        # metacharacters. Expand ~ and $VARS as the shell would have, and
        # ignore errors to mirror the `-f` flag.
        target = os.path.expanduser(os.path.expandvars(out))
        shutil.rmtree(target, ignore_errors=True)
    sys.exit(1)

def check_specified_country(df, country):
class State(abc.ABC):
    '''
    State interface

    Abstract base for the country-selection states. It cannot be
    instantiated directly; subclasses must implement
    check_specified_country.
    '''

    @abc.abstractmethod
    def check_specified_country(self, df, country):
        '''
        Return the dataframe to use for the given country selection.

        Args:
            df: the loaded reports dataframe.
            country: the country requested by the user (may be empty).
        '''

class CountryGivenState(State):
'''
let user filter reports by country, if not found
then give a option if the string is similar
State to be used if user gives a country
'''

# Get all unique countries in the data
country_list = list(map(lambda x:x.lower().strip(), set(df.country.values)))

if country:
def __init__(self):
pass

def check_specified_country(self, df, country):
# Get all unique countries in the data
country_list = list(map(lambda x:x.lower().strip(), set(df.country.values)))

print('Country specified!')
if country.lower() == 'Mainland China': #Mainland china and china doesn't come up as similar
print(country, 'was not listed. did you mean China?')
sys.exit(1)
# give similar option if similarity found
if country.lower() not in country_list:
get_similar_countries(country, country_list)

else:
#Return filtered dataframe
print('... filtering data for', country)
Expand All @@ -94,11 +91,53 @@ def check_specified_country(df, country):
else:
df = df[df.country == capwords(country)]
return df
else:

def get_similar_countries(c, country_list):
    '''
    Report that country `c` is not in the data, clean up, and exit.

    Prints a red "was not listed" message, adding the closest match from
    `country_list` as a suggestion when difflib finds one. If the
    module-level `out` path looks like a covidify-generated output folder
    it is removed. The process then exits with status 1.

    NOTE(review): indentation was lost in the reviewed text; exiting on
    every path (not only after a folder delete) is assumed - confirm.

    Args:
        c: country name as supplied by the user.
        country_list: candidate country names to match against.

    Raises:
        SystemExit: always, with exit code 1.
    '''
    # Function-scope import so the block does not rely on a file-level one.
    import shutil

    pos_countries = get_close_matches(c, country_list)

    if pos_countries:
        print('\033[1;31m'+c, 'was not listed. did you mean', pos_countries[0].capitalize() + '?\033[0;0m')
    else:
        print('\033[1;31m'+c, 'was not listed.\033[0;0m')

    #Only delete if its a covidify generated folder
    if 'Desktop/covidify-output-' in out:
        # Remove the folder directly instead of building an `rm -rf` shell
        # string, which breaks on paths containing spaces or shell
        # metacharacters. Expand ~ and $VARS as the shell would have, and
        # ignore errors to mirror the `-f` flag.
        target = os.path.expanduser(os.path.expandvars(out))
        shutil.rmtree(target, ignore_errors=True)
    sys.exit(1)

class GlobalCountryState(State):
    '''
    State to be used if no country is given
    '''

    def __init__(self):
        # This state carries no data, so there is nothing to set up.
        return None

    def check_specified_country(self, df, country):
        '''
        Announce that no country filter applies and hand `df` back as-is.
        The `country` argument is accepted for interface parity and unused.
        '''
        notice = '... No specific country specified'
        print(notice)
        return df

df = check_specified_country(df, country)

class Country:
    '''
    Context class for different states
    This class should be the one that is instantiated and called upon
    '''

    def __init__(self, country):
        '''
        Choose the active state from whether a country was supplied.

        Args:
            country: requested country; any falsy value selects
                GlobalCountryState, anything else CountryGivenState.
        '''
        if country:
            self._state = CountryGivenState()
        else:
            self._state = GlobalCountryState()

    def check_specified_country(self, df, country):
        '''
        Delegate to the active state and return its result.

        Args:
            df: the loaded reports dataframe.
            country: the country requested by the user.

        Returns:
            Whatever the active state's check_specified_country returns.
        '''
        # Propagate the state's result: the caller assigns this call's
        # return value back to df, so dropping it would leave df as None.
        return self._state.check_specified_country(df, country)

# Build the context for the requested country (a falsy value selects the
# no-country state), then rebind df to the value returned by the check.
context_country = Country(country)
df = context_country.check_specified_country(df, country)

############ DAILY CASES ############

Expand Down Expand Up @@ -170,9 +209,9 @@ def get_top_countries(data):
# Get top N infected countries
tmp_df = data.copy()
tmp_df = tmp_df[tmp_df.file_date == df.file_date.max()]
return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index
TOP_N_COUNTRIES = get_top_countries(df)
return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index

TOP_N_COUNTRIES = get_top_countries(df)

tmp_df = df[df.country.isin(TOP_N_COUNTRIES)].copy()

Expand All @@ -188,18 +227,18 @@ def get_day_counts(d, country):
'deaths': 'sum'})
result_df['date'] = data['file_date'].unique()
result_df['country'] = country

result_df = result_df[result_df.confirmed >= 500]
result_df.insert(loc=0, column='day', value=np.arange(len(result_df)))
return result_df

# Collects one get_day_counts() result per top country.
df_list = []

for country in TOP_N_COUNTRIES:
# Print the country's summed `confirmed` value for the latest file_date,
# then store its get_day_counts() frame.
print(' ...', country + ': ' + str(tmp_df[(tmp_df.file_date == df.file_date.max()) &
print(' ...', country + ': ' + str(tmp_df[(tmp_df.file_date == df.file_date.max()) &
(tmp_df.country == country)].confirmed.sum()))
df_list.append(get_day_counts(tmp_df[tmp_df.country == country], country))

# Stack the per-country frames row-wise with a fresh index.
log_df = pd.concat(df_list, axis=0, ignore_index=True)


Expand Down Expand Up @@ -227,4 +266,4 @@ def get_day_counts(d, country):
log_df.astype(str).to_csv(os.path.join(save_dir, log_file_name))
print('...', log_file_name)

print('Done!')
print('Done!')