-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyse.py
More file actions
135 lines (97 loc) · 4.82 KB
/
analyse.py
File metadata and controls
135 lines (97 loc) · 4.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from collections import Counter

import geopandas
import pandas as pd
import yaml
from matplotlib import pyplot as plt
from upsetplot import plot
# Read the list of VMPs to analyse from the project configuration.
with open("config.yaml", "r") as config_file:
    vmps = yaml.safe_load(config_file)["vmps"]

# Two-way lookup between a VMP's display name and its SNOMED code.
names_to_codes = {entry["name"]: entry["snomed_code"] for entry in vmps}
codes_to_names = {entry["snomed_code"]: entry["name"] for entry in vmps}

# Load the datasets
stores = pd.read_json("outputs/stores.json")
# Build point geometries from the stores' long/lat columns (EPSG:4326).
store_points = geopandas.points_from_xy(stores.long, stores.lat)
gdf = geopandas.GeoDataFrame(stores, geometry=store_points, crs="EPSG:4326")

# productId is read as text so it can be compared with the configured codes.
df = pd.read_csv("outputs/stock_levels_all.csv", dtype={"productId": str})
# A row counts as "in stock" when its stockLevel is "G" or "A".
df["has_stock"] = df["stockLevel"].isin(["G", "A"])
# Raises KeyError if the CSV contains a product that is not in config.yaml.
df["product_name"] = df["productId"].map(lambda code: codes_to_names[code])
def investigate():
    """Find stores lacking Utrogestan but stocking lisdex, then open the debugger.

    Uses the module-level ``df`` (stock levels), ``gdf`` (store locations) and
    ``names_to_codes`` lookup.  The function is an interactive aid: it stops
    in ``breakpoint()`` with all intermediate frames in scope.

    Returns:
        The rows of ``gdf`` for stores that (a) have a Utrogestan 100mg row
        without stock, (b) have a lisdex 20mg or 30mg row with stock, and
        (c) appear on more than one row of ``df``.
    """
    utrogestan = df[df.productId == names_to_codes["Utrogestan 100mg"]]
    lisdex_codes = [
        names_to_codes["Lisedexamfetamine 20mg caps"],
        names_to_codes["Lisedexamfetamine 30mg caps"],
    ]
    lisdex = df[df.productId.isin(lisdex_codes)]

    # * Which are the stores that do NOT have utrogestan and DO have lisdex?
    no_utrogestan_stores = set(utrogestan.loc[~utrogestan.has_stock, "storeId"])
    lisdex_stores = set(lisdex.loc[lisdex.has_stock, "storeId"])
    candidate_ids = no_utrogestan_stores & lisdex_stores

    # Filter on the storeId *column*: df keeps read_csv's default row index,
    # so df.loc[store_ids] would look the ids up as row labels instead.
    candidate_rows = df[df.storeId.isin(candidate_ids)]

    # Apply the "> 1" mask before taking the index; the index of the boolean
    # Series itself would contain every store regardless of its count.
    rows_per_store = candidate_rows.storeId.value_counts()
    repeated_ids = rows_per_store[rows_per_store > 1].index
    matching_stores = gdf[gdf.storeId.isin(repeated_ids)]

    breakpoint()
    # plotting shows nothing interesting by location.
    # * Which are the stores that DO have utrogestan and NONE of the others?
    # * There are about 50 which have BOTH lisdex but nothing else - who are they?
    return matching_stores
def time_dimension():
    """Plot how the availability of each product changes from day to day.

    Reads ``outputs/timeseries.csv``, adds the ``A`` and ``G`` columns into a
    single ``availability`` value, sums it per product per calendar day and
    shows one line per product.  Product codes that are missing from
    ``codes_to_names`` are labelled ``"?"``.
    """
    timeseries = pd.read_csv('outputs/timeseries.csv', parse_dates=['datetime'])
    timeseries["product_name"] = timeseries.productId.map(
        lambda code: codes_to_names.get(str(code), "?")
    )
    timeseries['availability'] = timeseries['A'] + timeseries['G']
    # resample() below needs the timestamps on the index.
    timeseries = timeseries.set_index('datetime')

    # One row per (product, day) holding that day's summed availability.
    per_product = timeseries.groupby('product_name')
    daily_totals = per_product.resample('D')['availability'].sum()
    daily_totals = daily_totals.reset_index()

    # Reshape so that each product is its own column, i.e. its own line.
    by_product = daily_totals.pivot(
        index='datetime', columns='product_name', values='availability'
    )
    by_product.plot(kind='line', figsize=(10, 6))

    plt.title('Availability Over Time by product')
    plt.xlabel('Date')
    plt.ylabel('Availability (%)')
    plt.legend(title='product')
    plt.show()
def _stock_combination_counts(stock_levels, vmps, with_stock=True):
    """Count stores for every combination of per-VMP stock flags.

    Args:
        stock_levels: DataFrame with ``productId``, ``storeId`` and boolean
            ``has_stock`` columns.
        vmps: Iterable of mappings providing ``name`` and ``snomed_code``.
        with_stock: When true a flag means "store has stock of this VMP";
            when false it means "store has no stock of this VMP".

    Returns:
        A Series of store counts, most common combination first, indexed by
        a boolean MultiIndex with one level per VMP name (in ``vmps`` order).
    """
    # A store with no row for a VMP is treated as not stocking it.
    missing_flag = not with_stock

    flags_by_vmp = {}
    for vmp in vmps:
        vmp_rows = stock_levels[stock_levels["productId"] == vmp["snomed_code"]]
        # Collapse to one flag per store: in stock if any of its rows says so.
        # This keeps the store index unique even when rows disagree.
        in_stock = vmp_rows.groupby("storeId")["has_stock"].any()
        flags_by_vmp[vmp["name"]] = in_stock if with_stock else ~in_stock

    # Use the union of all stores; adding columns one by one to an empty
    # frame would keep only the stores listed for the first VMP.
    all_stores = None
    for flags in flags_by_vmp.values():
        all_stores = flags.index if all_stores is None else all_stores.union(flags.index)

    # reindex(fill_value=...) keeps the columns boolean, so no NaN fill and
    # no object-dtype downcasting is needed afterwards.
    store_vmp_matrix = pd.DataFrame(
        {
            name: flags.reindex(all_stores, fill_value=missing_flag)
            for name, flags in flags_by_vmp.items()
        }
    )

    # Each row is a tuple of booleans; count identical rows directly.
    ranked = Counter(
        store_vmp_matrix.itertuples(index=False, name=None)
    ).most_common()
    multi_index = pd.MultiIndex.from_tuples(
        [combination for combination, _ in ranked],
        names=list(store_vmp_matrix.columns),
    )
    return pd.Series([count for _, count in ranked], index=multi_index)


def upsert_plot(with_stock=True):
    """Draw an UpSet plot to visualise co-occurrence of products across stores.

    The theory is that one of these might be useful as a denominator.

    Args:
        with_stock: When true, plot the overlap of stores that have stock of
            each VMP; when false, the overlap of stores that lack stock.
    """
    upset_data = _stock_combination_counts(df, vmps, with_stock=with_stock)

    # Plot using UpSet
    plot(upset_data)
    if with_stock:
        plt.title("Overlap of Stores with Stock of Different VMPs")
    else:
        plt.title("Overlap of Stores without Stock of Different VMPs")
    plt.show()
# Script entry point: exactly one analysis is enabled at a time by
# (un)commenting the calls below; currently only the time-series plot runs.
# NOTE(review): do_plot is not defined in this file - presumably it is an
# earlier name for upsert_plot(with_stock=False); confirm before re-enabling.
#do_plot(with_stock=False)
#investigate()
time_dimension()