# Standard-library imports first, then third-party (PEP 8 grouping).
import logging

import pandas as pd
import streamlit as st

# Configure root logging once at import time; module logger "aurora" is used
# throughout the app instead of bare print() calls.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
log = logging.getLogger("aurora")
log.info("App started")


# Compound pre-filled in the search box on first load.
DEFAULT_COMPOUND = "arctigenin"

# Directory holding the COCONUT / Laji.fi / GBIF input files.
DATA_DIR = "data"
@@ -105,22 +112,39 @@ def is_smiles(smiles_string: str) -> bool:
105112 return True
106113
107114########################################################################################
108- @st .cache_data
115+ @st .cache_data ( show_spinner = True , ttl = 3600 )
109116def load_data ():
110- print ("Processing short plant list of plants genera..." )
117+
118+ required = {
119+ "COCONUT_DB_PATH" : COCONUT_DB_PATH ,
120+ "LAJI_DB_PATH" : LAJI_DB_PATH ,
121+ "GBIF_DB_PATH" : GBIF_DB_PATH ,
122+ "LIST_PLANTS_GENERA_PATH" : LIST_PLANTS_GENERA_PATH ,
123+ }
124+ missing = []
125+ for label , p in required .items ():
126+ if not os .path .exists (p ) or os .path .getsize (p ) == 0 :
127+ missing .append (f"{ label } → { p } " )
128+
129+ if missing :
130+ st .error ("Required data files are missing or empty:\n - " + "\n - " .join (missing ))
131+ st .stop ()
132+
133+ log .info ("Processing short plant list of plants genera..." )
134+
111135 plants_genera = set ()
112136 with open (LIST_PLANTS_GENERA_PATH , "r" ) as f :
113137 plants_genera = set ([e .lower ().rstrip ("\r \n " ) for e in f if e .rstrip ("\r \n " )])
114138
115- print ("Processing Laji.fi database information..." )
139+ log . info ("Processing Laji.fi database information..." )
116140 laji = pd .read_csv (LAJI_DB_PATH , sep = "\t " , low_memory = False )
117141 laji ["name" ] = laji ["Scientific name" ].str .lower ()
118142 laji = laji [["name" , "Identifier" , "Observation count from Finland" , "Genus, Scientific name" ]].copy ()
119143 laji .columns = ["name" , "identifier_laji" , "obs. in Finland (laji)" , "genus_laji" ]
120144 laji ["genus_laji" ] = laji ["genus_laji" ].str .lower ()
121145 laji = laji .dropna (subset = ["name" ]).drop_duplicates ()
122146
123- print ("Processing GBIF database information..." )
147+ log . info ("Processing GBIF database information..." )
124148 gbif = pd .read_csv (GBIF_DB_PATH , sep = "\t " , low_memory = False )
125149 gbif = gbif [
126150 ["canonicalName" , "genus" , "obs_FI" , "obs_NO" , "count_FI_60N" , "count_NO_60N" , "count_FI_66N" , "count_NO_66N" , "genusKey" , "speciesKey" ]
@@ -162,7 +186,7 @@ def load_data():
162186 laji_gbif ["url" ] = laji_gbif ["url_laji" ].fillna (laji_gbif ["url_gbif" ])
163187 laji_gbif = laji_gbif .drop (columns = ["identifier_laji" , "genusKey_gbif" , "speciesKey_gbif" , "url_laji" , "url_gbif" ])
164188
165- print ("Processing Coconut database information..." )
189+ log . info ("Processing Coconut database information..." )
166190 coconut = pd .read_csv (COCONUT_DB_PATH , sep = "\t " , low_memory = False )
167191 coconut = coconut .dropna (subset = ["name" , "identifier" ])
168192 coconut = coconut .drop (columns = ["identifier" ])
@@ -187,12 +211,12 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
187211 return None , None , None
188212
189213 # Determine search mode and get initial data
190- print ("Analyse ->" ,compound )
191- print ("Analyse-> " ,smile )
192- print ("Analyse->" , genus )
214+ log . info ("Analyse -> %s " ,compound )
215+ log . info ("Analyse -> %s " ,smile )
216+ log . info ("Analyse -> %s" , genus )
193217 flag = False
194218 if smile :
195- print (f"Analysing SMILES '{ smile } ' (genus={ genus } )..." )
219+ log . info (f"Analysing SMILES '{ smile } ' (genus={ genus } )..." )
196220 # Filter coconut for the SMILES
197221 res = coco [coco ["canonical_smiles" ] == smile ].copy ()
198222 if res .empty :
@@ -212,7 +236,7 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
212236 flag = True
213237
214238 if compound and not flag :
215- print (f"Analysing compound '{ compound } ' (genus={ genus } )..." )
239+ log . info (f"Analysing compound '{ compound } ' (genus={ genus } )..." )
216240 # Filter coconut for the compound
217241 res = coco [coco ["name" ] == compound ].copy ()
218242 if res .empty :
@@ -223,23 +247,23 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
223247 compound = res ["name" ].iloc [0 ]
224248 smiles = res ["canonical_smiles" ].iloc [0 ]
225249
226- print ("Found -> Compound:" , compound )
227- print ("Found -> Smiles:" , smiles )
250+ log . info ("Found -> Compound: %s " , compound )
251+ log . info ("Found -> Smiles: %s " , smiles )
228252
229253 org = sorted (set ([e .lower ().strip () for e in org if e ]))
230254 # keep only plants
231255 org = [e for e in org if infer_genus (e ) in plants ]
232256 if not org :
233- print ("WARNING: No organisms found!" )
257+ log . info ("WARNING: No organisms found!" )
234258 return pd .DataFrame (), pd .DataFrame (), compound # Return empty DFs but the found compound name
235- print ("Organisms:" , len (org ))
259+ log . info ("Organisms: %d " , len (org ))
236260
237261 if genus :
238262 # use genus to make the search wider
239263 genera = [infer_genus (e ) for e in org ]
240264 genera = set ([e for e in genera if e ])
241265 if not genera :
242- print ("WARNING: No genera found for organisms!" )
266+ log . info ("WARNING: No genera found for organisms!" )
243267 return pd .DataFrame (), pd .DataFrame (), compound # Return empty DFs
244268 genera = pd .DataFrame ({"genus" : sorted (genera )})
245269 genera = pd .merge (genera , db , how = "left" , left_on = "genus" , right_on = "genus" )
@@ -248,7 +272,7 @@ def analyse(compound: str = "arctigenin", smile: str = "",genus: bool = False) -
248272
249273 res = pd .DataFrame ({"organism" : org })
250274
251- print ("Processing Laji & GBIF database information..." )
275+ log . info ("Processing Laji & GBIF database information..." )
252276 res = pd .merge (res , db , how = "left" , left_on = "organism" , right_on = "name" )
253277 res = res .drop (columns = ["name" ])
254278 res = res .dropna (subset = ["genus" ])
@@ -313,6 +337,12 @@ def paginate_df(df: pd.DataFrame, page_size: int = RESULTS_PAGE_SIZE):
313337
314338def df_to_xlsx_bytes (df : pd .DataFrame , sheet_name : str = "Sheet1" ) -> bytes :
315339 """Return an .xlsx file (as bytes) for the given DataFrame."""
340+ try :
341+ import openpyxl # ensure dependency exists at runtime
342+ except Exception as e :
343+ st .warning ("Excel export not available (openpyxl missing)." )
344+ log .warning ("openpyxl import failed: %s" , e )
345+ return b""
316346 bio = io .BytesIO ()
317347 with pd .ExcelWriter (bio , engine = "openpyxl" ) as writer :
318348 df .to_excel (writer , index = False , sheet_name = sheet_name )
@@ -391,8 +421,8 @@ def mk_link(row):
391421 use_genus = association == "genus"
392422 search_term = st .session_state .compound_input
393423
394- print ("Search ->" ,search_term )
395- print ("Search ->" ,is_smiles (search_term ))
424+ log . info ("Search -> %s " ,search_term )
425+ log . info ("Search -> %s " ,is_smiles (search_term ))
396426 # Check if the input is a SMILES string or a compound name
397427 if is_smiles (search_term ):
398428 results , summary , found_compound_name = analyse (compound = "" , smile = search_term , genus = use_genus )
@@ -436,7 +466,7 @@ def mk_link(row):
436466 ]
437467 for c in int_cols :
438468 if c in results_download .columns :
439- results_download [c ] = results_download [c ].astype ("Int64" )
469+ results_download [c ] = pd . to_numeric ( results_download [c ], errors = "coerce" ) .astype ("Int64" )
440470
441471 header_with_download (
442472 "Results" ,
0 commit comments