Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
api_config.py
.env
api_config.py
__pycache__/
*.pyc
.venv/
vector_store/
experiments/
metadata_inputs/
pdf_metadata_outputs/
xml_metadata_outputs/
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ cBioAbstractor is a Streamlit-based curation assistant for cancer genomics studi

- Upload a cancer genomics paper PDF
- Upload supplementary data files such as `.xlsx`, `.csv`, `.tsv`, `.txt`, `.maf`, `.docx`, and `.pdf`
- Download supplementary files automatically from PubMed Central using a PMCID or PMID
- Extract study-level metadata from the paper
- Classify supplementary sheets against cBioPortal file-format schemas
- Identify likely cBioPortal target files
Expand Down Expand Up @@ -74,12 +75,16 @@ pip install -r requirements.txt

## API Key Setup

Set your Anthropic API key locally as an environment variable:
Set either an Anthropic or OpenAI API key locally as an environment variable:

```bash
export ANTHROPIC_API_KEY="your-api-key"
# or
export OPENAI_API_KEY="your-api-key"
```

The Streamlit sidebar lets you choose between Anthropic and OpenAI models.

Do not commit API keys to GitHub.

Recommended `.gitignore` entries:
Expand Down Expand Up @@ -113,7 +118,7 @@ http://localhost:8501

1. Open the Streamlit app
2. Upload the main paper PDF
3. Upload one or more supplementary files
3. Upload one or more supplementary files, or enter a PMCID/PMID to fetch them from PubMed Central
4. Run the curation workflow
5. Review detected file types, required fields, and missing fields
6. Download the generated cBioPortal curation report
Expand Down Expand Up @@ -167,4 +172,3 @@ These examples help the app recognize recurring supplemental file patterns.
- Supplementary file classification
- cBioPortal format assessment
- Curation report generation

82 changes: 66 additions & 16 deletions cbio_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

import pandas as pd

from config import FEW_SHOT_DIR, DETECTION_SAMPLE_ROWS, CBIO_FORMAT_IDS
from config import FEW_SHOT_DIR, DETECTION_SAMPLE_ROWS

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -238,14 +238,7 @@ def _heuristic_detect(df: pd.DataFrame) -> Tuple[Optional[str], float]:
# LLM-powered detector (few-shot)
# ---------------------------------------------------------------------------

def _llm_detect(df: pd.DataFrame, examples: List[dict], api_key: str) -> Tuple[str, float, str]:
"""
Use Claude to detect the file type with few-shot examples injected.
Returns (detected_type, confidence, reasoning).
"""
import anthropic

# Build few-shot block
def _build_detection_prompt(df: pd.DataFrame, examples: List[dict]) -> str:
few_shot_block = ""
for i, ex in enumerate(examples[:6]): # max 6 examples to keep prompt manageable
few_shot_block += f"""
Expand All @@ -262,7 +255,7 @@ def _llm_detect(df: pd.DataFrame, examples: List[dict], api_key: str) -> Tuple[s
col_list = list(df.columns)
sample_rows = df.head(DETECTION_SAMPLE_ROWS).to_csv(sep="\t", index=False)

prompt = f"""You are a bioinformatics data curation expert specializing in cBioPortal data formats.
return f"""You are a bioinformatics data curation expert specializing in cBioPortal data formats.

Your task: identify which cBioPortal file type this supplemental data file represents.

Expand Down Expand Up @@ -295,20 +288,55 @@ def _llm_detect(df: pd.DataFrame, examples: List[dict], api_key: str) -> Tuple[s
}}
"""


def _parse_detection_response(raw: str) -> Tuple[str, float, str, dict]:
raw = raw.strip()
raw = re.sub(r"^```[^\n]*\n?", "", raw, flags=re.MULTILINE)
raw = re.sub(r"```$", "", raw, flags=re.MULTILINE).strip()

result = json.loads(raw)
return result["type"], float(result["confidence"]), result.get("reasoning", ""), result.get("column_mappings", {})


def _llm_detect(df: pd.DataFrame, examples: List[dict], api_key: str) -> Tuple[str, float, str, dict]:
    """
    Use Claude to detect the file type with few-shot examples injected.
    Returns (detected_type, confidence, reasoning, column_mappings).
    """
    import anthropic

    prompt = _build_detection_prompt(df, examples)
    client = anthropic.Anthropic(api_key=api_key)
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=800,
        messages=[{"role": "user", "content": prompt}],
    )

    # Fence stripping and JSON decoding live in _parse_detection_response,
    # so the raw reply text is handed over untouched.
    return _parse_detection_response(response.content[0].text)


def _openai_detect(
    df: pd.DataFrame,
    examples: List[dict],
    api_key: str,
    model: str = "gpt-5.5",
) -> Tuple[str, float, str, dict]:
    """
    Ask an OpenAI chat model to classify the file, using the few-shot prompt.

    The prompt is built by ``_build_detection_prompt`` and sent as a single
    user message; a missing reply body is treated as an empty string before
    being handed to ``_parse_detection_response``.

    Returns (detected_type, confidence, reasoning, column_mappings).
    """
    from openai import OpenAI

    detection_prompt = _build_detection_prompt(df, examples)
    completion = OpenAI(api_key=api_key).chat.completions.create(
        model=model,
        max_completion_tokens=800,
        messages=[{"role": "user", "content": detection_prompt}],
    )

    reply_text = completion.choices[0].message.content
    if not reply_text:
        reply_text = ""
    return _parse_detection_response(reply_text)


# ---------------------------------------------------------------------------
Expand All @@ -319,6 +347,7 @@ def detect_file_type(
df: pd.DataFrame,
anthropic_api_key: Optional[str] = None,
openai_api_key: Optional[str] = None,
openai_model: str = "gpt-5.5",
) -> dict:
"""
Detect the cBioPortal format of a DataFrame.
Expand Down Expand Up @@ -366,6 +395,27 @@ def detect_file_type(
except Exception as e:
logger.error(f"LLM detection failed: {e}")

if openai_api_key:
try:
examples = load_few_shot_examples()
llm_type, llm_conf, reasoning, mappings = _openai_detect(
df,
examples,
openai_api_key,
model=openai_model,
)
logger.info(f"OpenAI detection: type={llm_type}, confidence={llm_conf:.2f}")
return {
"type": llm_type,
"confidence": llm_conf,
"method": "openai_few_shot",
"reasoning": reasoning,
"column_mappings": mappings,
"low_confidence": llm_conf < DETECTION_CONFIDENCE_THRESHOLD,
}
except Exception as e:
logger.error(f"OpenAI detection failed: {e}")

# 3. Fallback: return best heuristic guess with low confidence flag
return {
"type": h_type or "clinical_sample",
Expand Down
10 changes: 9 additions & 1 deletion cbioportal_curator.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@

from spec_match import classify_sheet, ClassificationResult
from cbioportal_spec import SPEC_BY_KEY
from xml_metadata import extract_metadata_from_xml as _extract_metadata_from_xml
from xml_metadata import extract_xml_llm_text
from xml_metadata import extract_xml_text

# ─────────────────────────────────────────────────────────────
# Constants
Expand Down Expand Up @@ -636,6 +639,11 @@ def _find_int(patterns, text, default="?"):
"corresponding_authors": corresp,
}


def extract_metadata_from_xml(xml_source: str | bytes | Path) -> dict:
    """Forward *xml_source* to the ``xml_metadata`` extractor and return its result unchanged."""
    metadata = _extract_metadata_from_xml(xml_source)
    return metadata


def _extract_metadata_llm(pdf_text: str, model: str, temperature: float) -> dict:
import json, logging
llm = load_chat_model(model)
Expand Down Expand Up @@ -2519,4 +2527,4 @@ def curate(
for r in records
],
}
return {"report_path": output_path, "summary": summary}
return {"report_path": output_path, "summary": summary}
6 changes: 3 additions & 3 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

Notes
-----
The Anthropic API key is read directly from the ANTHROPIC_API_KEY environment
variable (or Streamlit secret / sidebar input) inside streamlit_app.py, so it
deliberately is not re-exported here.
API keys are read directly from ANTHROPIC_API_KEY or OPENAI_API_KEY environment
variables (or Streamlit secrets / sidebar input) inside streamlit_app.py, so
they deliberately are not re-exported here.
"""
import os

Expand Down
139 changes: 139 additions & 0 deletions llm_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from __future__ import annotations

import json
import re
import time
from typing import Any


def call_anthropic_with_retry(
    client,
    model: str,
    system: str,
    user_content: str,
    max_tokens: int = 2000,
    retries: int = 3,
    backoff: float = 5.0,
) -> str:
    """Send one system + user message via ``client.messages.create`` with retries.

    Rate-limit errors, API status errors with a 5xx code, and connection
    errors are retried, waiting ``backoff * attempt_number`` seconds between
    attempts. Any other API status error is re-raised immediately.

    Args:
        client: Object exposing ``messages.create`` (an Anthropic client).
        model: Model identifier passed through to the API.
        system: System prompt.
        user_content: Content of the single user message.
        max_tokens: Completion token limit passed through to the API.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds; multiplied by the attempt number.

    Returns:
        The ``text`` of the first content block in the response.

    Raises:
        The last retryable error once all attempts are used up, or
        ``RuntimeError`` when no attempt was made (``retries <= 0``).
    """
    import anthropic

    last_error: Exception | None = None
    for attempt in range(retries):
        try:
            response = client.messages.create(
                model=model,
                max_tokens=max_tokens,
                system=system,
                messages=[{"role": "user", "content": user_content}],
            )
            return response.content[0].text
        except anthropic.RateLimitError as exc:
            last_error = exc
        except anthropic.APIStatusError as exc:
            if exc.status_code < 500:
                # Client-side errors will not succeed on retry.
                raise
            last_error = exc
        except anthropic.APIConnectionError as exc:
            last_error = exc

        # Wait only when another attempt follows; sleeping after the final
        # failure would just delay the error reaching the caller.
        if attempt < retries - 1:
            time.sleep(backoff * (attempt + 1))

    raise last_error or RuntimeError("Anthropic API call failed after retries.")


def call_openai_with_retry(
    client,
    model: str,
    system: str,
    user_content: str,
    max_tokens: int = 2000,
    retries: int = 3,
    backoff: float = 5.0,
) -> str:
    """Send a system + user chat completion request with retries.

    Rate-limit errors, API status errors with a 5xx code, and connection
    errors are retried, waiting ``backoff * attempt_number`` seconds between
    attempts. Any other API status error is re-raised immediately, as is the
    ``RuntimeError`` raised for an empty reply.

    Args:
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        model: Model identifier passed through to the API.
        system: System prompt.
        user_content: Content of the user message.
        max_tokens: Sent as ``max_completion_tokens``.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds; multiplied by the attempt number.

    Returns:
        The message content of the first choice.

    Raises:
        RuntimeError: If the first choice has empty content, or if no attempt
            was made (``retries <= 0``).
        The last retryable error once all attempts are used up.
    """
    import openai

    last_error: Exception | None = None
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model,
                max_completion_tokens=max_tokens,
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": user_content},
                ],
            )
            content = response.choices[0].message.content or ""
            if not content:
                # Surface the finish reason and usage so the cause of the
                # empty reply can be diagnosed from the error message.
                finish_reason = response.choices[0].finish_reason
                usage = getattr(response, "usage", None)
                raise RuntimeError(
                    "OpenAI returned an empty message content. "
                    f"finish_reason={finish_reason}, usage={usage}"
                )
            return content
        except openai.RateLimitError as exc:
            last_error = exc
        except openai.APIStatusError as exc:
            if exc.status_code < 500:
                # Client-side errors will not succeed on retry.
                raise
            last_error = exc
        except openai.APIConnectionError as exc:
            last_error = exc

        # Wait only when another attempt follows; sleeping after the final
        # failure would just delay the error reaching the caller.
        if attempt < retries - 1:
            time.sleep(backoff * (attempt + 1))

    raise last_error or RuntimeError("OpenAI API call failed after retries.")


def call_llm_with_retry(
    provider: str,
    api_key: str,
    model: str,
    system: str,
    user_content: str,
    max_tokens: int = 2000,
) -> str:
    """Route a single system + user prompt to the chosen provider's retry helper.

    ``provider`` must be exactly ``"Anthropic"`` or ``"OpenAI"``; any other
    value raises ``ValueError``. The provider SDK is imported lazily, so only
    the selected one is needed. Returns whatever the provider-specific helper
    returns.
    """
    if provider not in ("Anthropic", "OpenAI"):
        raise ValueError(f"Unsupported LLM provider: {provider}")

    request_kwargs = dict(
        model=model,
        system=system,
        user_content=user_content,
        max_tokens=max_tokens,
    )

    if provider == "Anthropic":
        import anthropic

        anthropic_client = anthropic.Anthropic(api_key=api_key)
        return call_anthropic_with_retry(client=anthropic_client, **request_kwargs)

    from openai import OpenAI

    openai_client = OpenAI(api_key=api_key)
    return call_openai_with_retry(client=openai_client, **request_kwargs)


def parse_llm_json(raw: str) -> dict[str, Any]:
    """Decode JSON from an LLM reply, tolerating code fences and stray prose.

    Markdown fence markers are removed first. If the remaining text does not
    decode as JSON, the span from the first ``{`` to the last ``}`` is tried;
    when no such span exists the original ``json.JSONDecodeError`` propagates.
    """
    text = raw.strip()
    for fence_pattern in (r"^```[^\n]*\n?", r"```$"):
        text = re.sub(fence_pattern, "", text, flags=re.MULTILINE)
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        first_brace = text.find("{")
        last_brace = text.rfind("}")
        if first_brace < 0 or last_brace <= first_brace:
            raise
        return json.loads(text[first_brace : last_brace + 1])
Loading