-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.py
More file actions
118 lines (98 loc) · 4.02 KB
/
config.py
File metadata and controls
118 lines (98 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Configuration module for Bibliometric Analysis Pipeline.
This module handles:
1. Environment variables (API Keys).
2. Directory paths.
3. Loading of domain-specific configuration (Journals, Buzzwords) from CSV files.
"""
import os
import pandas as pd
import sys
# --- 1. Environment & API Keys ---
# Credentials come from the process environment; a variable that is not set
# yields an empty string. Nothing in this block loads a .env file itself, so
# export the variables (or load the .env file) before importing this module.
ELSEVIER_API_KEY = os.environ.get("ELSEVIER_API_KEY", "")
INST_TOKEN = os.environ.get("INST_TOKEN", "")

# Only a warning is emitted here: importing the module never aborts because
# the key is missing.
if ELSEVIER_API_KEY == "":
    print("WARNING: ELSEVIER_API_KEY is not set. API calls will fail.")
# --- 2. Directory Paths ---
# Absolute path of the folder that contains this script. Every other path is
# built from it, so data and results always live next to the code regardless
# of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Data tree: <BASE_DIR>/data, with "raw" and "processed" beneath it.
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DATA_DIR = os.path.join(BASE_DIR, "data", "raw")
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "data", "processed")

# Results tree: <BASE_DIR>/results, with "figures" beneath it.
RESULTS_DIR = os.path.join(BASE_DIR, "results")
FIGURES_DIR = os.path.join(BASE_DIR, "results", "figures")

# Create the whole tree at import time; existing directories are left alone.
for d in (DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR, RESULTS_DIR, FIGURES_DIR):
    os.makedirs(d, exist_ok=True)

# Input files, both expected directly inside DATA_DIR.
JOURNALS_CSV = os.path.join(DATA_DIR, "journals.csv")
BUZZWORDS_CSV = os.path.join(DATA_DIR, "technical_buzzwords.csv")

# --- 3. Analysis Parameters ---
# Number of characters kept before/after a keyword match.
CONTEXT_WINDOW_SIZE = 75
# --- 4. Data Loading Helpers ---
def load_journals(csv_path=None):
    """
    Loads the list of target journals from CSV.

    Args:
        csv_path: Optional path to the journals CSV. When omitted, the
            module-level JOURNALS_CSV path is used.

    Returns:
        list: Unique journal names from the 'journal' column, in first-seen
        order. String names are stripped of surrounding whitespace and blank
        names are dropped. An empty list is returned when the file is missing,
        cannot be read, or has no 'journal' column; a message is printed in
        each of those cases.
    """
    path = JOURNALS_CSV if csv_path is None else csv_path

    if not os.path.exists(path):
        print(f"WARNING: Journals file not found at {path}")
        return []

    try:
        df = pd.read_csv(path)
    except Exception as e:
        # Deliberately broad: this runs at import time, so an unreadable or
        # malformed file must degrade to "no journals" instead of crashing.
        print(f"ERROR loading journals: {e}")
        return []

    if 'journal' not in df.columns:
        print(f"WARNING: 'journal' column not found in {path}")
        return []

    # Normalise whitespace so " Nature " and "Nature" count as one journal.
    cleaned = (
        name.strip() if isinstance(name, str) else name
        for name in df['journal'].dropna().tolist()
    )
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(name for name in cleaned if name != ""))
def load_buzzwords(csv_path=None):
    """
    Loads technical buzzwords and their terms.

    The CSV is expected to have a 'buzzword' column (the topic) and a 'terms'
    column holding a comma-separated list of terms for that topic.

    Args:
        csv_path: Optional path to the buzzwords CSV. When omitted, the
            module-level BUZZWORDS_CSV path is used.

    Returns:
        dict: { 'Major Topic': ['term1', 'term2', ...], ... }
        Terms are stripped of surrounding whitespace and empty terms are
        dropped. Rows with a missing topic, missing terms, or no non-empty
        term are skipped. If a topic appears on several rows, the last row
        wins. An empty dict is returned when the file is missing, cannot be
        read, or lacks the expected columns; a message is printed in each of
        those cases.
    """
    path = BUZZWORDS_CSV if csv_path is None else csv_path

    if not os.path.exists(path):
        print(f"WARNING: Buzzwords file not found at {path}")
        return {}

    try:
        df = pd.read_csv(path)
    except Exception as e:
        # Deliberately broad: this runs at import time, so an unreadable or
        # malformed file must degrade to "no buzzwords" instead of crashing.
        print(f"ERROR loading buzzwords: {e}")
        return {}

    if 'buzzword' not in df.columns or 'terms' not in df.columns:
        print(f"WARNING: Expected columns 'buzzword' and 'terms' in {path}")
        return {}

    buzzwords = {}
    for topic, terms_str in zip(df['buzzword'], df['terms']):
        if pd.isna(topic) or pd.isna(terms_str):
            continue
        # Split on commas, trim each piece, and drop empties: a trailing or
        # doubled comma would otherwise yield "" as a term, and an empty
        # search term matches every text.
        stripped = (piece.strip() for piece in str(terms_str).split(','))
        terms_list = [term for term in stripped if term]
        if terms_list:
            buzzwords[topic] = terms_list
    return buzzwords
# --- 5. Global Config Object ---
# Loaded once at import time so every importer shares the same values.
JOURNALS_LIST = load_journals()
BUZZWORDS_DICT = load_buzzwords()

if __name__ == "__main__":
    # Sanity checks when running config.py directly.
    # BASE_DIR is the root directory this module defines; no PROJECT_ROOT
    # name exists in this file, so referencing it would raise NameError.
    print(f"Project Root: {BASE_DIR}")
    print(f"Journals Loaded: {len(JOURNALS_LIST)}")
    if JOURNALS_LIST:
        print(f"Sample Journal: {JOURNALS_LIST[0]}")
    print(f"Buzzword Topics Loaded: {len(BUZZWORDS_DICT)}")
    if BUZZWORDS_DICT:
        # First topic in insertion order, without building a list of all keys.
        sample_key = next(iter(BUZZWORDS_DICT))
        print(f"Sample Topic: {sample_key}, Terms: {BUZZWORDS_DICT[sample_key][:3]}...")