Skip to content

Commit 51d7e6e

Browse files
committed
updates
1 parent 7e0690e commit 51d7e6e

7 files changed

Lines changed: 85 additions & 21 deletions

File tree

.github/workflows/ci.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
name: CI
2+
on: [push, pull_request]
3+
jobs:
4+
install-and-smoke:
5+
runs-on: ubuntu-latest
6+
steps:
7+
- uses: actions/checkout@v4
8+
- uses: actions/setup-python@v5
9+
with:
10+
python-version: '3.12'
11+
- run: pip install -r requirements.txt
12+
- run: python - << 'PY'
13+
import importlib, sys
14+
for m in ["pandas","numpy","streamlit","openpyxl"]:
15+
importlib.import_module(m)
16+
print("Deps OK")
17+
PY
18+

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,7 @@ venv/
1313
# OS
1414
.DS_Store
1515

16+
*~
1617

18+
# Always keep GitHub workflows
19+
!.github/

DATA_LICENSE.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
This project displays derivative data from:
2+
- GBIF.org – CC0/CC-BY/CC-BY-NC (varies by dataset)
3+
- Laji.fi – CC-BY
4+
- COCONUT.naturalproducts.net – CC0
5+
6+
The **data** remains subject to the original licenses and terms of use from the respective providers listed above.

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Aurora
1+
# Aurora app
22

33
[![Streamlit](https://img.shields.io/badge/Built%20with-Streamlit-ff4b4b.svg)](https://streamlit.io/)
44

@@ -23,4 +23,11 @@ pip install -r requirements.txt
2323
# run the app
2424
streamlit run aurora/app.py
2525

26+
## Data Sources & Attribution
27+
28+
- [COCONUT](https://coconut.naturalproducts.net/) (Collection of Open Natural Products database) - CC0 license
29+
- [Laji.fi](https://laji.fi/) (Finnish Biodiversity Information Facility) - CC-BY license
30+
- [GBIF](https://www.gbif.org/) (Global Biodiversity Information Facility) - CC0/CC-BY/CC-BY-NC licenses (depending on the dataset)
31+
32+
All rights and data terms are respected in accordance with the guidelines of the source providers listed above.
2633

app.py

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@
1414
import pandas as pd
1515
import streamlit as st
1616

17+
import logging
18+
19+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
20+
log = logging.getLogger("aurora")
21+
log.info("App started")
22+
23+
1724
DEFAULT_COMPOUND = "arctigenin"
1825

1926
DATA_DIR = "data"
@@ -105,22 +112,39 @@ def is_smiles(smiles_string: str) -> bool:
105112
return True
106113

107114
########################################################################################
108-
@st.cache_data
115+
@st.cache_data(show_spinner=True, ttl=3600)
109116
def load_data():
110-
print("Processing short plant list of plants genera...")
117+
118+
required = {
119+
"COCONUT_DB_PATH": COCONUT_DB_PATH,
120+
"LAJI_DB_PATH": LAJI_DB_PATH,
121+
"GBIF_DB_PATH": GBIF_DB_PATH,
122+
"LIST_PLANTS_GENERA_PATH": LIST_PLANTS_GENERA_PATH,
123+
}
124+
missing = []
125+
for label, p in required.items():
126+
if not os.path.exists(p) or os.path.getsize(p) == 0:
127+
missing.append(f"{label}{p}")
128+
129+
if missing:
130+
st.error("Required data files are missing or empty:\n- " + "\n- ".join(missing))
131+
st.stop()
132+
133+
log.info("Processing short plant list of plants genera...")
134+
111135
plants_genera = set()
112136
with open(LIST_PLANTS_GENERA_PATH, "r") as f:
113137
plants_genera = set([e.lower().rstrip("\r\n") for e in f if e.rstrip("\r\n")])
114138

115-
print("Processing Laji.fi database information...")
139+
log.info("Processing Laji.fi database information...")
116140
laji = pd.read_csv(LAJI_DB_PATH, sep="\t", low_memory=False)
117141
laji["name"] = laji["Scientific name"].str.lower()
118142
laji = laji[["name", "Identifier", "Observation count from Finland", "Genus, Scientific name"]].copy()
119143
laji.columns = ["name", "identifier_laji", "obs. in Finland (laji)", "genus_laji"]
120144
laji["genus_laji"] = laji["genus_laji"].str.lower()
121145
laji = laji.dropna(subset=["name"]).drop_duplicates()
122146

123-
print("Processing GBIF database information...")
147+
log.info("Processing GBIF database information...")
124148
gbif = pd.read_csv(GBIF_DB_PATH, sep="\t", low_memory=False)
125149
gbif = gbif[
126150
["canonicalName", "genus", "obs_FI", "obs_NO", "count_FI_60N", "count_NO_60N", "count_FI_66N", "count_NO_66N", "genusKey", "speciesKey"]
@@ -162,7 +186,7 @@ def load_data():
162186
laji_gbif["url"] = laji_gbif["url_laji"].fillna(laji_gbif["url_gbif"])
163187
laji_gbif = laji_gbif.drop(columns=["identifier_laji", "genusKey_gbif", "speciesKey_gbif", "url_laji", "url_gbif"])
164188

165-
print("Processing Coconut database information...")
189+
log.info("Processing Coconut database information...")
166190
coconut = pd.read_csv(COCONUT_DB_PATH, sep="\t", low_memory=False)
167191
coconut = coconut.dropna(subset=["name", "identifier"])
168192
coconut = coconut.drop(columns=["identifier"])
@@ -187,12 +211,12 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
187211
return None, None, None
188212

189213
# Determine search mode and get initial data
190-
print("Analyse ->",compound)
191-
print("Analyse->",smile)
192-
print("Analyse->", genus)
214+
log.info("Analyse -> %s",compound)
215+
log.info("Analyse -> %s",smile)
216+
log.info("Analyse -> %s",genus)
193217
flag = False
194218
if smile:
195-
print(f"Analysing SMILES '{smile}' (genus={genus})...")
219+
log.info(f"Analysing SMILES '{smile}' (genus={genus})...")
196220
# Filter coconut for the SMILES
197221
res = coco[coco["canonical_smiles"] == smile].copy()
198222
if res.empty:
@@ -212,7 +236,7 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
212236
flag = True
213237

214238
if compound and not flag:
215-
print(f"Analysing compound '{compound}' (genus={genus})...")
239+
log.info(f"Analysing compound '{compound}' (genus={genus})...")
216240
# Filter coconut for the compound
217241
res = coco[coco["name"] == compound].copy()
218242
if res.empty:
@@ -223,23 +247,23 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
223247
compound = res["name"].iloc[0]
224248
smiles = res["canonical_smiles"].iloc[0]
225249

226-
print("Found -> Compound:", compound)
227-
print("Found -> Smiles:", smiles)
250+
log.info("Found -> Compound: %s", compound)
251+
log.info("Found -> Smiles: %s", smiles)
228252

229253
org = sorted(set([e.lower().strip() for e in org if e]))
230254
# keep only plants
231255
org = [e for e in org if infer_genus(e) in plants]
232256
if not org:
233-
print("WARNING: No organisms found!")
257+
log.info("WARNING: No organisms found!")
234258
return pd.DataFrame(), pd.DataFrame(), compound # Return empty DFs but the found compound name
235-
print("Organisms:", len(org))
259+
log.info("Organisms: %d", len(org))
236260

237261
if genus:
238262
# use genus to make the search wider
239263
genera = [infer_genus(e) for e in org]
240264
genera = set([e for e in genera if e])
241265
if not genera:
242-
print("WARNING: No genera found for organisms!")
266+
log.info("WARNING: No genera found for organisms!")
243267
return pd.DataFrame(), pd.DataFrame(), compound # Return empty DFs
244268
genera = pd.DataFrame({"genus": sorted(genera)})
245269
genera = pd.merge(genera, db, how="left", left_on="genus", right_on="genus")
@@ -248,7 +272,7 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
248272

249273
res = pd.DataFrame({"organism": org})
250274

251-
print("Processing Laji & GBIF database information...")
275+
log.info("Processing Laji & GBIF database information...")
252276
res = pd.merge(res, db, how="left", left_on="organism", right_on="name")
253277
res = res.drop(columns=["name"])
254278
res = res.dropna(subset=["genus"])
@@ -313,6 +337,12 @@ def paginate_df(df: pd.DataFrame, page_size: int = RESULTS_PAGE_SIZE):
313337

314338
def df_to_xlsx_bytes(df: pd.DataFrame, sheet_name: str = "Sheet1") -> bytes:
315339
"""Return an .xlsx file (as bytes) for the given DataFrame."""
340+
try:
341+
import openpyxl # ensure dependency exists at runtime
342+
except Exception as e:
343+
st.warning("Excel export not available (openpyxl missing).")
344+
log.warning("openpyxl import failed: %s", e)
345+
return b""
316346
bio = io.BytesIO()
317347
with pd.ExcelWriter(bio, engine="openpyxl") as writer:
318348
df.to_excel(writer, index=False, sheet_name=sheet_name)
@@ -391,8 +421,8 @@ def mk_link(row):
391421
use_genus = association == "genus"
392422
search_term = st.session_state.compound_input
393423

394-
print("Search ->",search_term)
395-
print("Search ->",is_smiles(search_term))
424+
log.info("Search -> %s",search_term)
425+
log.info("Search -> %s",is_smiles(search_term))
396426
# Check if the input is a SMILES string or a compound name
397427
if is_smiles(search_term):
398428
results, summary, found_compound_name = analyse(compound="", smile=search_term, genus=use_genus)
@@ -436,7 +466,7 @@ def mk_link(row):
436466
]
437467
for c in int_cols:
438468
if c in results_download.columns:
439-
results_download[c] = results_download[c].astype("Int64")
469+
results_download[c] = pd.to_numeric(results_download[c], errors="coerce").astype("Int64")
440470

441471
header_with_download(
442472
"Results",

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
# Core
32
pandas==2.1.4
43
numpy==1.26.4

runtime.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.12

0 commit comments

Comments
 (0)