2 changes: 1 addition & 1 deletion api/routes/analysis.py
@@ -278,7 +278,7 @@ def _run_analysis(
if "llm_combine" not in config:
config["llm_combine"] = {}
config["llm_combine"]["model_name"] = model_name
if "gpt" in model_name or "o1" in model_name or "o3" in model_name:
if model_name.startswith("gpt") or "o1" in model_name or "o3" in model_name:
config["llm_combine"]["model_type"] = "openai"
else:
config["llm_combine"]["model_type"] = "local"
24 changes: 12 additions & 12 deletions config.yml
@@ -1,13 +1,13 @@
#model_names: gpt-4o, o1-preview, o1-mini
#model_type: "openai" #"openai / local / aitta
#model_type: "openai" #"openai / local
llm_rag:
model_type: "openai"
model_type: "openai"
model_name: "gpt-5-mini" # used only for RAGs
llm_smart: #used only in smart_agent
model_type: "openai"
model_type: "openai"
model_name: "gpt-5.2" # used only for smart agent
llm_combine: #used only in combine_agent and intro
model_type: "openai"
model_type: "openai"
model_name: "gpt-5.2" # used only for combine agent ("mkchaou/climsight-calm_ft_Q3_13k")
llm_dataanalysis: #used only in data_analysis_agent
model_type: "openai"
@@ -20,6 +20,7 @@ use_smart_agent: true
use_era5_data: true # Download ERA5 time series from CDS API (requires credentials)
use_destine_data: true # Download DestinE projections via HDA API (requires DESP credentials)
use_powerful_data_analysis: true
llm_local_endpoint_url: "http://localhost:8000/v1"

# ERA5 Climatology Configuration (pre-computed observational baseline)
era5_climatology:
@@ -210,23 +211,22 @@ ecocrop:
data_path: "./data/ecocrop/ecocrop_database/"
rag_settings:
rag_activated: True
# Which embedding backend to use: openai, aitta, mistral, etc.
embedding_model_type: "openai" # options: openai, aitta, mistral
# Which embedding backend to use: openai, local, mistral, etc.
embedding_model_type: "openai" # options: openai, local, mistral
local_endpoint_url: "http://localhost:8000/v1"
# Embedding model name for each backend
embedding_model_openai: "text-embedding-3-large"
embedding_model_aitta: "lightonai/modernbert-embed-large"
embedding_model_local: "lightonai/modernbert-embed-large"
# Add more as needed, e.g.:
# embedding_model_mistral: "mistral-embed-xyz"
# Chroma DB paths for each backend
chroma_path_ipcc_openai: "rag_db/ipcc_reports_openai"
chroma_path_ipcc_aitta: "rag_db/ipcc_reports_aitta"
chroma_path_ipcc_local: "rag_db/ipcc_reports_local"
# chroma_path_ipcc_mistral: "rag_db/ipcc_reports_mistral"
chroma_path_general_openai: "rag_db/general_reports_openai"
chroma_path_general_aitta: "rag_db/general_reports_aitta"
chroma_path_general_local: "rag_db/general_reports_local"
# chroma_path_general_mistral: "rag_db/general_reports_mistral"
# AITTA configuration for open models (optional, only needed for aitta)
aitta_url: "https://api-climatedt-aitta.2.rahtiapp.fi"
document_path: './data/general_reports/' # or ipcc_text_reports
document_path: './data/ipcc_text_reports/' # or general_reports
chunk_size: 2000
chunk_overlap: 200
separators: [" ", ",", "\n"]
2 changes: 2 additions & 0 deletions frontend/src/components/SettingsPanel.tsx
@@ -10,6 +10,8 @@ const MODEL_OPTIONS = [
'gpt-4.1-nano',
'gpt-4.1-mini',
'gpt-4.1',
'openai/gpt-oss-120b',
'meta-llama/Llama-3.3-70B-Instruct',
];

const CLIMATE_SOURCES = [
1 change: 0 additions & 1 deletion pyproject.toml
@@ -75,7 +75,6 @@ dependencies = [
]

[project.optional-dependencies]
aitta = ["aitta-client"]
dev = ["pytest", "flake8"]

[build-system]
81 changes: 28 additions & 53 deletions rag/db_generation.py
@@ -3,6 +3,7 @@

import os
import logging
import tqdm
import yaml
import re

@@ -20,7 +21,6 @@
# Import the new embedding utility
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
from climsight.embedding_utils import create_embeddings

logger = logging.getLogger(__name__)
logging.basicConfig(
@@ -141,22 +141,18 @@ def split_docs(documents, chunk_size=2000, chunk_overlap=200, separators=[" ", "
return docs


def chunk_and_embed_documents(document_path, embedding_model, openai_api_key, aitta_url, model_type, chunk_size=2000, chunk_overlap=200, separators=[" ", ",", "\n"]):
def chunk_documents(document_path, chunk_size=2000, chunk_overlap=200, separators=[" ", ",", "\n"]):
"""
Chunks and embeds documents from the specified directory using provided embedding function.
Chunks documents from the specified directory.

Args:
- document_path (str): The path to the directory containing the documents.
- embedding_model (str): The embedding model name to use for generating embeddings.
- openai_api_key (str): OpenAI API key for OpenAI models.
- aitta_url (str): AITTA API URL for open models.
- model_type (str): The type of embedding model backend (e.g., 'openai', 'aitta').
- chunk_size (int): maximum number of characters per chunk. Default: 2000.
- chunk_overlap (int): number of characters to overlap per chunk. Default: 200.
- separators (list): list of characters where text can be split. Default: [" ", ",", "\n"]

Returns:
- list: A list of documents with embeddings.
- list: A list of chunked documents.
"""
# load documents
file_names = get_file_names(document_path)
@@ -167,7 +163,7 @@ def chunk_and_embed_documents(document_path, embedding_model, openai_api_key, ai
all_documents.extend(documents) # save all of them into one

if not all_documents:
logger.info("No documents found for chunking and embedding.")
logger.info("No documents found for chunking.")
return []

# Chunk documents
@@ -180,27 +176,7 @@ def chunk_and_embed_documents(document_path, embedding_model, openai_api_key, ai

logger.info(f"Chunked documents into {len(chunked_docs)} pieces.")

# Create embedding model using the utility function
try:
aitta_api_key = os.getenv('AITTA_API_KEY')
embedding_item = create_embeddings(
embedding_model=embedding_model,
openai_api_key=openai_api_key,
aitta_api_key=aitta_api_key,
aitta_url=aitta_url,
model_type=model_type
)
# embedding documents
embedded_docs = []
for doc in chunked_docs:
embedding = embedding_item.embed_documents([doc.page_content])[0] # embed_documents returns a list, so we take the first element
embedded_docs.append({"text": doc.page_content, "embedding": embedding, "metadata": doc.metadata})
except Exception as e:
logger.error(f"Failed to embed document chunks: {e}")
return []

logger.info(f"Embedded {len(embedded_docs)} document chunks.")
return embedded_docs
return chunked_docs
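With embedding stripped out, `chunk_documents` is now a pure text-loading and splitting step that needs no API key; vectors are produced later when the chunks are added to Chroma. A hedged usage sketch, with paths and sizes taken from the defaults in `config.yml`:

```python
# Hedged usage sketch: chunking alone, no credentials required.
docs = chunk_documents(
    document_path="./data/ipcc_text_reports/",
    chunk_size=2000,
    chunk_overlap=200,
    separators=[" ", ",", "\n"],
)
print(f"{len(docs)} chunks ready for embedding")  # embedding happens later via the Chroma embedding_function
```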


def initialize_rag(config):
@@ -222,9 +198,9 @@ def initialize_rag(config):
if embedding_model_type == 'openai':
embedding_model = rag_settings.get('embedding_model_openai')
chroma_path = rag_settings.get('chroma_path_ipcc_openai')
elif embedding_model_type == 'aitta':
embedding_model = rag_settings.get('embedding_model_aitta')
chroma_path = rag_settings.get('chroma_path_ipcc_aitta')
elif embedding_model_type == 'local':
embedding_model = rag_settings.get('embedding_model_local')
chroma_path = rag_settings.get('chroma_path_ipcc_local')
# Add more types here as needed
# elif embedding_model_type == 'mistral':
# embedding_model = rag_settings.get('embedding_model_mistral')
@@ -233,8 +209,7 @@ def initialize_rag(config):
raise ValueError(f"Unknown embedding_model_type: {embedding_model_type}")

openai_api_key = os.getenv('OPENAI_API_KEY')
aitta_api_key = os.getenv('AITTA_API_KEY')
aitta_url = rag_settings.get('aitta_url', os.getenv('AITTA_URL', 'https://api-climatedt-aitta.2.rahtiapp.fi'))
local_api_key = os.getenv('OPENAI_API_KEY_LOCAL')
document_path = rag_settings['document_path']
chunk_size = rag_settings['chunk_size']
chunk_overlap = rag_settings['chunk_overlap']
@@ -244,8 +219,8 @@ def initialize_rag(config):
if embedding_model_type == 'openai' and not openai_api_key:
logger.warning("No OpenAI API Key found. Skipping RAG initialization.")
return False
if embedding_model_type == 'aitta' and not aitta_api_key:
logger.warning("No AITTA API Key found. Skipping RAG initialization.")
if embedding_model_type == 'local' and not local_api_key:
logger.warning("No local API Key found. Skipping RAG initialization.")
return False

# check if documents are present and valid
@@ -255,24 +230,24 @@ def initialize_rag(config):

# Perform chunking and embedding
try:
langchain_ef = create_embeddings(
embedding_model=embedding_model,
openai_api_key=openai_api_key,
aitta_api_key=aitta_api_key,
aitta_url=aitta_url,
model_type=embedding_model_type
)
documents = chunk_and_embed_documents(document_path, embedding_model, openai_api_key, aitta_url, embedding_model_type, chunk_size, chunk_overlap, separators)
converted_documents = [
Document(page_content=doc['text'], metadata=doc['metadata'])
for doc in documents
]
rag_db = Chroma.from_documents(
documents=converted_documents,
if config['rag_settings']['embedding_model_type'] == 'local':
langchain_ef = OpenAIEmbeddings(
api_key=local_api_key,
base_url=rag_settings.get('local_endpoint_url'),
model=embedding_model,
tiktoken_enabled=False
)
else:
langchain_ef = OpenAIEmbeddings(api_key=openai_api_key, model=embedding_model)
documents = chunk_documents(document_path, chunk_size, chunk_overlap, separators)
rag_db = Chroma(
collection_name="ipcc_collection",
persist_directory=chroma_path,
embedding=langchain_ef,
collection_name="ipcc_collection"
embedding_function=langchain_ef
)
batch_size = 32
for i in tqdm.tqdm(range(0, len(documents), batch_size)):
rag_db.add_documents(documents[i:i+batch_size])
rag_ready = True
logger.info(f"RAG ready: {rag_ready}")
logger.info("RAG database has been initialized and documents embedded.")
21 changes: 5 additions & 16 deletions src/climsight/climsight_engine.py
@@ -69,7 +69,7 @@
write_climate_data_manifest,
)
# import smart_agent
from smart_agent import get_aitta_chat_model, smart_agent
from smart_agent import smart_agent
# import data_analysis_agent
from data_analysis_agent import data_analysis_agent
# import predefined data preparation functions
@@ -663,7 +663,7 @@ def agent_llm_request(content_message, input_params, config, api_key, api_key_lo
logger.info(f"start agent_request")
if config['llm_combine']['model_type'] == "local":
llm_combine_agent = ChatOpenAI(
openai_api_base="http://localhost:8000/v1",
openai_api_base=config['llm_local_endpoint_url'],
model_name=config['llm_combine']['model_name'], # Match the exact model name you used
openai_api_key=api_key_local,
max_tokens=16000,
@@ -688,20 +688,14 @@ def agent_llm_request(content_message, input_params, config, api_key, api_key_lo
max_tokens=16000,
)
llm_intro = llm_combine_agent
elif config['llm_combine']['model_type'] == 'aitta':
llm_combine_agent = get_aitta_chat_model(
config['llm_combine']['model_name'],
max_completion_tokens=4096
)
llm_intro = llm_combine_agent

# Data analysis LLM (separate from combine step).
llm_dataanalysis_cfg = config.get("llm_dataanalysis")
if not llm_dataanalysis_cfg:
raise RuntimeError("Missing llm_dataanalysis configuration.")
if llm_dataanalysis_cfg.get("model_type") == "local":
llm_dataanalysis_agent = ChatOpenAI(
openai_api_base="http://localhost:8000/v1",
openai_api_base=config["llm_local_endpoint_url"],
model_name=llm_dataanalysis_cfg.get("model_name"),
openai_api_key=api_key_local,
max_tokens=16000,
@@ -712,11 +706,6 @@ def agent_llm_request(content_message, input_params, config, api_key, api_key_lo
model_name=llm_dataanalysis_cfg.get("model_name"),
max_tokens=16000,
)
elif llm_dataanalysis_cfg.get("model_type") == "aitta":
llm_dataanalysis_agent = get_aitta_chat_model(
llm_dataanalysis_cfg.get("model_name"),
max_completion_tokens=4096
)
else:
llm_dataanalysis_agent = llm_combine_agent

@@ -1187,7 +1176,7 @@ class routeResponse(BaseModel):
# Pass the dictionary to invoke
input = {"user_text": state.user}
response = chain.invoke(input)
elif config['llm_combine']['model_type'] in ("local", "aitta"):
elif config['llm_combine']['model_type'] == "local":
prompt_text = intro_prompt.format(user_text=state.user)
response_raw = llm_intro.invoke(prompt_text)
import re, json
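For the local model type the intro route falls back to formatting the prompt manually, invoking the raw model, and parsing the reply; the parsing itself sits outside the visible hunk. Purely as an illustration of that pattern, and not the repo's actual code, one common approach is to pull the first JSON object out of the response text:

```python
# Illustration only: the real parsing in climsight_engine.py is outside this hunk.
import json
import re

def extract_first_json(raw_text: str) -> dict:
    """Return the first {...} block found in a model response, or {} if none parses."""
    match = re.search(r"\{.*\}", raw_text, flags=re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            pass
    return {}
```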
@@ -1275,7 +1264,7 @@ def combine_agent(state: AgentState):
state.content_message += "\n ECOCROP Search Response: {ecocrop_search_response} "
logger.info(f"Ecocrop_search_response: {state.ecocrop_search_response}")

if config['llm_combine']['model_type'] in ("local", "aitta"):
if config['llm_combine']['model_type'] == "local":
system_message_prompt = SystemMessagePromptTemplate.from_template(config['system_role'])
elif config['llm_combine']['model_type'] == "openai":
if "o1" in config['llm_combine']['model_name']:
72 changes: 0 additions & 72 deletions src/climsight/embedding_utils.py

This file was deleted.
