BibliometricAnalysis/config.py at main · uihilab/BibliometricAnalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

"""
Configuration module for Bibliometric Analysis Pipeline.

This module handles:
1. Environment variables (API Keys).
2. Directory paths.
3. Loading of domain-specific configuration (Journals, Buzzwords) from CSV files.
"""

import os
import pandas as pd
import sys

# --- 1. Environment & API Keys ---
# Credentials come from the process environment; a variable that is not set
# resolves to an empty string rather than None.
ELSEVIER_API_KEY = os.environ.get("ELSEVIER_API_KEY", "")
INST_TOKEN = os.environ.get("INST_TOKEN", "")

if ELSEVIER_API_KEY == "":
    print("WARNING: ELSEVIER_API_KEY is not set. API calls will fail.")

# --- 2. Directory Paths ---
# Everything is anchored at the folder that contains this file, so the
# data/ and results/ trees live next to the scripts.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Input data tree
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DATA_DIR = os.path.join(BASE_DIR, "data", "raw")
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "data", "processed")

# Output tree
RESULTS_DIR = os.path.join(BASE_DIR, "results")
FIGURES_DIR = os.path.join(BASE_DIR, "results", "figures")

# Create any missing folder at import time (no-op when it already exists)
for d in (DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR, RESULTS_DIR, FIGURES_DIR):
    os.makedirs(d, exist_ok=True)

# Input Files
JOURNALS_CSV = os.path.join(DATA_DIR, "journals.csv")
BUZZWORDS_CSV = os.path.join(DATA_DIR, "technical_buzzwords.csv")

# --- 3. Analysis Parameters ---
# Number of characters kept before/after a keyword match
CONTEXT_WINDOW_SIZE = 75

# --- 4. Data Loading Helpers ---

def load_journals(path=None):
    """
    Loads the list of target journals from CSV.

    The file must contain a 'journal' column. Missing values are skipped,
    surrounding whitespace is trimmed from string entries, blank entries are
    dropped, and duplicates are removed while keeping first-seen order.

    Any failure (missing file, unreadable CSV, missing column) is reported
    with a printed message and yields an empty list instead of raising.

    Args:
        path: Optional CSV path. Defaults to JOURNALS_CSV when omitted.
    Returns:
        list: A list of journal names.
    """
    # Resolve the default lazily so an explicit path never needs the constant.
    csv_path = JOURNALS_CSV if path is None else path

    if not os.path.exists(csv_path):
        print(f"WARNING: Journals file not found at {csv_path}")
        return []

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Deliberately broad: configuration loading is best-effort and must
        # not abort the import of this module.
        print(f"ERROR loading journals: {e}")
        return []

    if 'journal' not in df.columns:
        print(f"WARNING: 'journal' column not found in {csv_path}")
        return []

    # Trim strings so " Name " and "Name" are treated as the same journal.
    cleaned = (
        name.strip() if isinstance(name, str) else name
        for name in df['journal'].dropna().tolist()
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(name for name in cleaned if name != ""))

def load_buzzwords(path=None):
    """
    Loads technical buzzwords and their terms.

    The file must contain a 'buzzword' column (the topic) and a 'terms'
    column (comma-separated terms). Rows where either cell is missing are
    skipped. Each term is whitespace-trimmed and empty terms (e.g. from a
    trailing or doubled comma) are dropped. If a topic appears on several
    rows, the last row wins.

    Any failure (missing file, unreadable CSV, missing columns) is reported
    with a printed message and yields an empty dict instead of raising.

    Args:
        path: Optional CSV path. Defaults to BUZZWORDS_CSV when omitted.
    Returns:
        dict: { 'Major Topic': ['term1', 'term2', ...], ... }
    """
    # Resolve the default lazily so an explicit path never needs the constant.
    csv_path = BUZZWORDS_CSV if path is None else path

    if not os.path.exists(csv_path):
        print(f"WARNING: Buzzwords file not found at {csv_path}")
        return {}

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Deliberately broad: configuration loading is best-effort and must
        # not abort the import of this module.
        print(f"ERROR loading buzzwords: {e}")
        return {}

    if 'buzzword' not in df.columns or 'terms' not in df.columns:
        print(f"WARNING: Expected columns 'buzzword' and 'terms' in {csv_path}")
        return {}

    buzzwords = {}
    for topic, terms_str in zip(df['buzzword'], df['terms']):
        if pd.isna(topic) or pd.isna(terms_str):
            continue
        # Split comma separated terms and clean whitespace
        stripped = (t.strip() for t in str(terms_str).split(','))
        # An empty term carries no meaning as a keyword, so drop it.
        buzzwords[topic] = [t for t in stripped if t]

    return buzzwords

# --- 5. Global Config Object ---
# Load these once on import to be available everywhere
JOURNALS_LIST = load_journals()
BUZZWORDS_DICT = load_buzzwords()

if __name__ == "__main__":
    # Sanity Checks when running config.py directly.
    # BASE_DIR is the directory constant this module defines; the previously
    # referenced PROJECT_ROOT does not exist and raised NameError here.
    print(f"Project Root: {BASE_DIR}")
    print(f"Journals Loaded: {len(JOURNALS_LIST)}")
    if JOURNALS_LIST:
        print(f"Sample Journal: {JOURNALS_LIST[0]}")

    print(f"Buzzword Topics Loaded: {len(BUZZWORDS_DICT)}")
    if BUZZWORDS_DICT:
        # First topic in insertion order, without building a throwaway list
        sample_key = next(iter(BUZZWORDS_DICT))
        print(f"Sample Topic: {sample_key}, Terms: {BUZZWORDS_DICT[sample_key][:3]}...")