Skip to content

Commit 51d7e6e

Browse files
committed
updates
1 parent 7e0690e commit 51d7e6e

7 files changed

Lines changed: 85 additions & 21 deletions

File tree

.github/workflows/ci.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
name: CI
2+
on: [push, pull_request]
3+
jobs:
4+
install-and-smoke:
5+
runs-on: ubuntu-latest
6+
steps:
7+
- uses: actions/checkout@v4
8+
- uses: actions/setup-python@v5
9+
with:
10+
python-version: '3.12'
11+
- run: pip install -r requirements.txt
12+
- run: python - << 'PY'
13+
import importlib, sys
14+
for m in ["pandas","numpy","streamlit","openpyxl"]:
15+
importlib.import_module(m)
16+
print("Deps OK")
17+
PY
18+

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,7 @@ venv/
1313
# OS
1414
.DS_Store
1515

16+
*~
1617

18+
# Always keep GitHub workflows
19+
!.github/

DATA_LICENSE.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
This project displays derivative data from:
2+
- GBIF.org – CC0/CC-BY/CC-BY-NC (varies by dataset)
3+
- Laji.fi – CC-BY
4+
- COCONUT.naturalproducts.net – CC0
5+
6+
The **data** remains subject to the original licenses and terms of use from the respective providers listed above.

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Aurora
1+
# Aurora app
22

33
[![Streamlit](https://img.shields.io/badge/Built%20with-Streamlit-ff4b4b.svg)](https://streamlit.io/)
44

@@ -23,4 +23,11 @@ pip install -r requirements.txt
2323
# run the app
2424
streamlit run aurora/app.py
2525

26+
## Data Sources & Attribution
27+
28+
- [COCONUT](https://coconut.naturalproducts.net/) (Collection of Open Natural Products database) - CC0 license
29+
- [Laji.fi](https://laji.fi/) (Finnish Biodiversity Information Facility) - CC-BY license
30+
- [GBIF](https://www.gbif.org/) (Global Biodiversity Information Facility) - CC0/CC-BY/CC-BY-NC licenses (depending on the dataset)
31+
32+
All rights and data terms are respected in accordance with the guidelines of the source providers listed above.
2633

app.py

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@
1414
import pandas as pd
1515
import streamlit as st
1616

17+
import logging
18+
19+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
20+
log = logging.getLogger("aurora")
21+
log.info("App started")
22+
23+
1724
DEFAULT_COMPOUND = "arctigenin"
1825

1926
DATA_DIR = "data"
@@ -105,22 +112,39 @@ def is_smiles(smiles_string: str) -> bool:
105112
return True
106113

107114
########################################################################################
108-
@st.cache_data
115+
@st.cache_data(show_spinner=True, ttl=3600)
109116
def load_data():
110-
print("Processing short plant list of plants genera...")
117+
118+
required = {
119+
"COCONUT_DB_PATH": COCONUT_DB_PATH,
120+
"LAJI_DB_PATH": LAJI_DB_PATH,
121+
"GBIF_DB_PATH": GBIF_DB_PATH,
122+
"LIST_PLANTS_GENERA_PATH": LIST_PLANTS_GENERA_PATH,
123+
}
124+
missing = []
125+
for label, p in required.items():
126+
if not os.path.exists(p) or os.path.getsize(p) == 0:
127+
missing.append(f"{label}{p}")
128+
129+
if missing:
130+
st.error("Required data files are missing or empty:\n- " + "\n- ".join(missing))
131+
st.stop()
132+
133+
log.info("Processing short plant list of plants genera...")
134+
111135
plants_genera = set()
112136
with open(LIST_PLANTS_GENERA_PATH, "r") as f:
113137
plants_genera = set([e.lower().rstrip("\r\n") for e in f if e.rstrip("\r\n")])
114138

115-
print("Processing Laji.fi database information...")
139+
log.info("Processing Laji.fi database information...")
116140
laji = pd.read_csv(LAJI_DB_PATH, sep="\t", low_memory=False)
117141
laji["name"] = laji["Scientific name"].str.lower()
118142
laji = laji[["name", "Identifier", "Observation count from Finland", "Genus, Scientific name"]].copy()
119143
laji.columns = ["name", "identifier_laji", "obs. in Finland (laji)", "genus_laji"]
120144
laji["genus_laji"] = laji["genus_laji"].str.lower()
121145
laji = laji.dropna(subset=["name"]).drop_duplicates()
122146

123-
print("Processing GBIF database information...")
147+
log.info("Processing GBIF database information...")
124148
gbif = pd.read_csv(GBIF_DB_PATH, sep="\t", low_memory=False)
125149
gbif = gbif[
126150
["canonicalName", "genus", "obs_FI", "obs_NO", "count_FI_60N", "count_NO_60N", "count_FI_66N", "count_NO_66N", "genusKey", "speciesKey"]
@@ -162,7 +186,7 @@ def load_data():
162186
laji_gbif["url"] = laji_gbif["url_laji"].fillna(laji_gbif["url_gbif"])
163187
laji_gbif = laji_gbif.drop(columns=["identifier_laji", "genusKey_gbif", "speciesKey_gbif", "url_laji", "url_gbif"])
164188

165-
print("Processing Coconut database information...")
189+
log.info("Processing Coconut database information...")
166190
coconut = pd.read_csv(COCONUT_DB_PATH, sep="\t", low_memory=False)
167191
coconut = coconut.dropna(subset=["name", "identifier"])
168192
coconut = coconut.drop(columns=["identifier"])
@@ -187,12 +211,12 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
187211
return None, None, None
188212

189213
# Determine search mode and get initial data
190-
print("Analyse ->",compound)
191-
print("Analyse->",smile)
192-
print("Analyse->", genus)
214+
log.info("Analyse -> %s",compound)
215+
log.info("Analyse -> %s",smile)
216+
log.info("Analyse -> %s",genus)
193217
flag = False
194218
if smile:
195-
print(f"Analysing SMILES '{smile}' (genus={genus})...")
219+
log.info(f"Analysing SMILES '{smile}' (genus={genus})...")
196220
# Filter coconut for the SMILES
197221
res = coco[coco["canonical_smiles"] == smile].copy()
198222
if res.empty:
@@ -212,7 +236,7 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
212236
flag = True
213237

214238
if compound and not flag:
215-
print(f"Analysing compound '{compound}' (genus={genus})...")
239+
log.info(f"Analysing compound '{compound}' (genus={genus})...")
216240
# Filter coconut for the compound
217241
res = coco[coco["name"] == compound].copy()
218242
if res.empty:
@@ -223,23 +247,23 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
223247
compound = res["name"].iloc[0]
224248
smiles = res["canonical_smiles"].iloc[0]
225249

226-
print("Found -> Compound:", compound)
227-
print("Found -> Smiles:", smiles)
250+
log.info("Found -> Compound: %s", compound)
251+
log.info("Found -> Smiles: %s", smiles)
228252

229253
org = sorted(set([e.lower().strip() for e in org if e]))
230254
# keep only plants
231255
org = [e for e in org if infer_genus(e) in plants]
232256
if not org:
233-
print("WARNING: No organisms found!")
257+
log.info("WARNING: No organisms found!")
234258
return pd.DataFrame(), pd.DataFrame(), compound # Return empty DFs but the found compound name
235-
print("Organisms:", len(org))
259+
log.info("Organisms: %d", len(org))
236260

237261
if genus:
238262
# use genus to make the search wider
239263
genera = [infer_genus(e) for e in org]
240264
genera = set([e for e in genera if e])
241265
if not genera:
242-
print("WARNING: No genera found for organisms!")
266+
log.info("WARNING: No genera found for organisms!")
243267
return pd.DataFrame(), pd.DataFrame(), compound # Return empty DFs
244268
genera = pd.DataFrame({"genus": sorted(genera)})
245269
genera = pd.merge(genera, db, how="left", left_on="genus", right_on="genus")
@@ -248,7 +272,7 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
248272

249273
res = pd.DataFrame({"organism": org})
250274

251-
print("Processing Laji & GBIF database information...")
275+
log.info("Processing Laji & GBIF database information...")
252276
res = pd.merge(res, db, how="left", left_on="organism", right_on="name")
253277
res = res.drop(columns=["name"])
254278
res = res.dropna(subset=["genus"])
@@ -313,6 +337,12 @@ def paginate_df(df: pd.DataFrame, page_size: int = RESULTS_PAGE_SIZE):
313337

314338
def df_to_xlsx_bytes(df: pd.DataFrame, sheet_name: str = "Sheet1") -> bytes:
315339
"""Return an .xlsx file (as bytes) for the given DataFrame."""
340+
try:
341+
import openpyxl # ensure dependency exists at runtime
342+
except Exception as e:
343+
st.warning("Excel export not available (openpyxl missing).")
344+
log.warning("openpyxl import failed: %s", e)
345+
return b""
316346
bio = io.BytesIO()
317347
with pd.ExcelWriter(bio, engine="openpyxl") as writer:
318348
df.to_excel(writer, index=False, sheet_name=sheet_name)
@@ -391,8 +421,8 @@ def mk_link(row):
391421
use_genus = association == "genus"
392422
search_term = st.session_state.compound_input
393423

394-
print("Search ->",search_term)
395-
print("Search ->",is_smiles(search_term))
424+
log.info("Search -> %s",search_term)
425+
log.info("Search -> %s",is_smiles(search_term))
396426
# Check if the input is a SMILES string or a compound name
397427
if is_smiles(search_term):
398428
results, summary, found_compound_name = analyse(compound="", smile=search_term, genus=use_genus)
@@ -436,7 +466,7 @@ def mk_link(row):
436466
]
437467
for c in int_cols:
438468
if c in results_download.columns:
439-
results_download[c] = results_download[c].astype("Int64")
469+
results_download[c] = pd.to_numeric(results_download[c], errors="coerce").astype("Int64")
440470

441471
header_with_download(
442472
"Results",

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
# Core
32
pandas==2.1.4
43
numpy==1.26.4

runtime.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.12

0 commit comments

Comments
 (0)