sbabyanusha · fppcng · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026 · Jun 8, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,10 @@
-api_config.py
 .env
+api_config.py
+__pycache__/
+*.pyc
+.venv/
+vector_store/
+experiments/
+metadata_inputs/
+pdf_metadata_outputs/
+xml_metadata_outputs/
diff --git a/README.md b/README.md
@@ -8,6 +8,7 @@ cBioAbstractor is a Streamlit-based curation assistant for cancer genomics studi
 
 - Upload a cancer genomics paper PDF
 - Upload supplementary data files such as `.xlsx`, `.csv`, `.tsv`, `.txt`, `.maf`, `.docx`, and `.pdf`
+- Download supplementary files automatically from PubMed Central using a PMCID or PMID
 - Extract study-level metadata from the paper
 - Classify supplementary sheets against cBioPortal file-format schemas
 - Identify likely cBioPortal target files
@@ -74,12 +75,16 @@ pip install -r requirements.txt
 
 ## API Key Setup
 
-Set your Anthropic API key locally as an environment variable:
+Set either an Anthropic or OpenAI API key locally as an environment variable:
 
 ```bash
 export ANTHROPIC_API_KEY="your-api-key"
+# or
+export OPENAI_API_KEY="your-api-key"
 ```
 
+The Streamlit sidebar lets you choose between Anthropic and OpenAI models.
+
 Do not commit API keys to GitHub.
 
 Recommended `.gitignore` entries:
@@ -113,7 +118,7 @@ http://localhost:8501
 
 1. Open the Streamlit app
 2. Upload the main paper PDF
-3. Upload one or more supplementary files
+3. Upload one or more supplementary files, or enter a PMCID/PMID to fetch them from PubMed Central
 4. Run the curation workflow
 5. Review detected file types, required fields, and missing fields
 6. Download the generated cBioPortal curation report
@@ -167,4 +172,3 @@ These examples help the app recognize recurring supplemental file patterns.
 - Supplementary file classification
 - cBioPortal format assessment
 - Curation report generation
-
diff --git a/cbio_detector.py b/cbio_detector.py
@@ -32,7 +32,7 @@
 
 import pandas as pd
 
-from config import FEW_SHOT_DIR, DETECTION_SAMPLE_ROWS, CBIO_FORMAT_IDS
+from config import FEW_SHOT_DIR, DETECTION_SAMPLE_ROWS
 
 logger = logging.getLogger(__name__)
 
@@ -238,14 +238,7 @@ def _heuristic_detect(df: pd.DataFrame) -> Tuple[Optional[str], float]:
 # LLM-powered detector (few-shot)
 # ---------------------------------------------------------------------------
 
-def _llm_detect(df: pd.DataFrame, examples: List[dict], api_key: str) -> Tuple[str, float, str]:
-    """
-    Use Claude to detect the file type with few-shot examples injected.
-    Returns (detected_type, confidence, reasoning).
-    """
-    import anthropic
-
-    # Build few-shot block
+def _build_detection_prompt(df: pd.DataFrame, examples: List[dict]) -> str:
     few_shot_block = ""
     for i, ex in enumerate(examples[:6]):  # max 6 examples to keep prompt manageable
         few_shot_block += f"""
@@ -262,7 +255,7 @@ def _llm_detect(df: pd.DataFrame, examples: List[dict], api_key: str) -> Tuple[s
     col_list = list(df.columns)
     sample_rows = df.head(DETECTION_SAMPLE_ROWS).to_csv(sep="\t", index=False)
 
-    prompt = f"""You are a bioinformatics data curation expert specializing in cBioPortal data formats.
+    return f"""You are a bioinformatics data curation expert specializing in cBioPortal data formats.
 
 Your task: identify which cBioPortal file type this supplemental data file represents.
 
@@ -295,20 +288,55 @@ def _llm_detect(df: pd.DataFrame, examples: List[dict], api_key: str) -> Tuple[s
 }}
 """
 
+
+def _parse_detection_response(raw: str) -> Tuple[str, float, str, dict]:
+    """
+    Parse the JSON verdict returned by a detection LLM.
+
+    Markdown code fences are stripped first. If the remaining text is not
+    valid JSON on its own (for example the model wrapped the object in a
+    sentence), the span from the first "{" to the last "}" is parsed instead.
+
+    Returns (detected_type, confidence, reasoning, column_mappings).
+    Raises json.JSONDecodeError when no JSON can be parsed, KeyError when
+    "type" or "confidence" is missing, and ValueError/TypeError when
+    "confidence" cannot be converted to float.
+    """
+    cleaned = raw.strip()
+    cleaned = re.sub(r"^```[^\n]*\n?", "", cleaned, flags=re.MULTILINE)
+    cleaned = re.sub(r"```$", "", cleaned, flags=re.MULTILINE).strip()
+
+    try:
+        result = json.loads(cleaned)
+    except json.JSONDecodeError:
+        start, end = cleaned.find("{"), cleaned.rfind("}")
+        if start < 0 or end <= start:
+            raise
+        result = json.loads(cleaned[start : end + 1])
+
+    # "or" also covers an explicit JSON null, so the declared str/dict
+    # positions of the tuple never carry None.
+    return (
+        result["type"],
+        float(result["confidence"]),
+        result.get("reasoning") or "",
+        result.get("column_mappings") or {},
+    )
+
+
+def _llm_detect(df: pd.DataFrame, examples: List[dict], api_key: str) -> Tuple[str, float, str, dict]:
+    """
+    Use Claude to detect the file type with few-shot examples injected.
+    Returns (detected_type, confidence, reasoning, column_mappings).
+
+    The prompt is built by _build_detection_prompt and the text of the first
+    content block of the reply is handed to _parse_detection_response, so
+    any error raised by that parser propagates to the caller.
+    """
+    import anthropic
+
+    prompt = _build_detection_prompt(df, examples)
     client = anthropic.Anthropic(api_key=api_key)
     response = client.messages.create(
         model="claude-sonnet-4-20250514",
         max_tokens=800,
         messages=[{"role": "user", "content": prompt}],
     )
 
-    raw = response.content[0].text.strip()
-    # Strip accidental markdown fences
-    raw = re.sub(r"^```[^\n]*\n?", "", raw, flags=re.MULTILINE)
-    raw = re.sub(r"```$", "", raw, flags=re.MULTILINE).strip()
+    return _parse_detection_response(response.content[0].text)
 
-    result = json.loads(raw)
-    return result["type"], float(result["confidence"]), result.get("reasoning", ""), result.get("column_mappings", {})
+
+def _openai_detect(
+    df: pd.DataFrame,
+    examples: List[dict],
+    api_key: str,
+    model: str = "gpt-5.5",
+) -> Tuple[str, float, str, dict]:
+    """
+    Use OpenAI to detect the file type with few-shot examples injected.
+    Returns (detected_type, confidence, reasoning, column_mappings).
+
+    Raises RuntimeError when the first choice carries no message content,
+    and whatever _parse_detection_response raises for an unparsable reply.
+    """
+    from openai import OpenAI
+
+    prompt = _build_detection_prompt(df, examples)
+    client = OpenAI(api_key=api_key)
+    response = client.chat.completions.create(
+        model=model,
+        max_completion_tokens=800,
+        messages=[{"role": "user", "content": prompt}],
+    )
+
+    choice = response.choices[0]
+    content = choice.message.content or ""
+    if not content:
+        # Report the finish reason here instead of letting json.loads("")
+        # fail later with an opaque "Expecting value" error.
+        raise RuntimeError(
+            "OpenAI returned an empty message content. "
+            f"finish_reason={choice.finish_reason}"
+        )
+
+    return _parse_detection_response(content)
 
 
 # ---------------------------------------------------------------------------
@@ -319,6 +347,7 @@ def detect_file_type(
     df: pd.DataFrame,
     anthropic_api_key: Optional[str] = None,
     openai_api_key: Optional[str] = None,
+    openai_model: str = "gpt-5.5",
 ) -> dict:
     """
     Detect the cBioPortal format of a DataFrame.
@@ -366,6 +395,27 @@ def detect_file_type(
         except Exception as e:
             logger.error(f"LLM detection failed: {e}")
 
+    if openai_api_key:
+        try:
+            examples = load_few_shot_examples()
+            llm_type, llm_conf, reasoning, mappings = _openai_detect(
+                df,
+                examples,
+                openai_api_key,
+                model=openai_model,
+            )
+            logger.info(f"OpenAI detection: type={llm_type}, confidence={llm_conf:.2f}")
+            return {
+                "type": llm_type,
+                "confidence": llm_conf,
+                "method": "openai_few_shot",
+                "reasoning": reasoning,
+                "column_mappings": mappings,
+                "low_confidence": llm_conf < DETECTION_CONFIDENCE_THRESHOLD,
+            }
+        except Exception as e:
+            logger.error(f"OpenAI detection failed: {e}")
+
     # 3. Fallback: return best heuristic guess with low confidence flag
     return {
         "type": h_type or "clinical_sample",

diff --git a/cbioportal_curator.py b/cbioportal_curator.py
@@ -39,6 +39,9 @@
 
 from spec_match import classify_sheet, ClassificationResult
 from cbioportal_spec import SPEC_BY_KEY
+from xml_metadata import extract_metadata_from_xml as _extract_metadata_from_xml
+from xml_metadata import extract_xml_llm_text
+from xml_metadata import extract_xml_text
 
 # ─────────────────────────────────────────────────────────────
 # Constants
@@ -636,6 +639,11 @@ def _find_int(patterns, text, default="?"):
         "corresponding_authors": corresp,
     }
 
+
+def extract_metadata_from_xml(xml_source: str | bytes | Path) -> dict:
+    """Forward to xml_metadata.extract_metadata_from_xml.
+
+    The argument is passed through unchanged and the delegate's result is
+    returned as-is, so this module exposes the XML extractor under its own
+    name.
+    """
+    return _extract_metadata_from_xml(xml_source)
+
+
 def _extract_metadata_llm(pdf_text: str, model: str, temperature: float) -> dict:
     import json, logging
     llm = load_chat_model(model)
@@ -2519,4 +2527,4 @@ def curate(
             for r in records
         ],
     }
-    return {"report_path": output_path, "summary": summary}
+    return {"report_path": output_path, "summary": summary}
diff --git a/config.py b/config.py
@@ -6,9 +6,9 @@
 
 Notes
 -----
-The Anthropic API key is read directly from the ANTHROPIC_API_KEY environment
-variable (or Streamlit secret / sidebar input) inside streamlit_app.py, so it
-deliberately is not re-exported here.
+API keys are read directly from ANTHROPIC_API_KEY or OPENAI_API_KEY environment
+variables (or Streamlit secrets / sidebar input) inside streamlit_app.py, so
+they deliberately are not re-exported here.
 """
 import os
 

diff --git a/llm_client.py b/llm_client.py
@@ -0,0 +1,139 @@
+from __future__ import annotations
+
+import json
+import re
+import time
+from typing import Any
+
+
+def call_anthropic_with_retry(
+    client,
+    model: str,
+    system: str,
+    user_content: str,
+    max_tokens: int = 2000,
+    retries: int = 3,
+    backoff: float = 5.0,
+) -> str:
+    """Call the Anthropic Messages API, retrying transient failures.
+
+    Rate-limit errors, connection errors and status errors with a code of
+    500 or above are retried, for at most ``retries`` attempts in total.
+    The pause between attempts grows linearly: ``backoff * attempt_number``
+    seconds. Status errors below 500 (other than rate limits) propagate
+    immediately.
+
+    Returns the text of the first content block of the response.
+    Raises the last transient error once all attempts are used up, or
+    RuntimeError when ``retries`` is 0 and no call was made.
+    """
+    import anthropic
+
+    last_error: Exception | None = None
+    for attempt in range(retries):
+        try:
+            response = client.messages.create(
+                model=model,
+                max_tokens=max_tokens,
+                system=system,
+                messages=[{"role": "user", "content": user_content}],
+            )
+            return response.content[0].text
+        except (anthropic.RateLimitError, anthropic.APIConnectionError) as exc:
+            last_error = exc
+        except anthropic.APIStatusError as exc:
+            if exc.status_code < 500:
+                raise
+            last_error = exc
+
+        # Back off only when another attempt follows; sleeping after the
+        # final failure would merely delay the error reaching the caller.
+        if attempt + 1 < retries:
+            time.sleep(backoff * (attempt + 1))
+
+    raise last_error or RuntimeError("Anthropic API call failed after retries.")
+
+
+def call_openai_with_retry(
+    client,
+    model: str,
+    system: str,
+    user_content: str,
+    max_tokens: int = 2000,
+    retries: int = 3,
+    backoff: float = 5.0,
+) -> str:
+    """Call the OpenAI Chat Completions API, retrying transient failures.
+
+    Rate-limit errors, connection errors and status errors with a code of
+    500 or above are retried, for at most ``retries`` attempts in total.
+    The pause between attempts grows linearly: ``backoff * attempt_number``
+    seconds. Status errors below 500 (other than rate limits) propagate
+    immediately.
+
+    Returns the message content of the first choice.
+    Raises RuntimeError (without retrying) when that content is empty, the
+    last transient error once all attempts are used up, or RuntimeError
+    when ``retries`` is 0 and no call was made.
+    """
+    import openai
+
+    last_error: Exception | None = None
+    for attempt in range(retries):
+        try:
+            response = client.chat.completions.create(
+                model=model,
+                max_completion_tokens=max_tokens,
+                messages=[
+                    {"role": "system", "content": system},
+                    {"role": "user", "content": user_content},
+                ],
+            )
+            content = response.choices[0].message.content or ""
+            if not content:
+                finish_reason = response.choices[0].finish_reason
+                usage = getattr(response, "usage", None)
+                # Not one of the retried error types, so this leaves the
+                # loop right away with the diagnostic details attached.
+                raise RuntimeError(
+                    "OpenAI returned an empty message content. "
+                    f"finish_reason={finish_reason}, usage={usage}"
+                )
+            return content
+        except (openai.RateLimitError, openai.APIConnectionError) as exc:
+            last_error = exc
+        except openai.APIStatusError as exc:
+            if exc.status_code < 500:
+                raise
+            last_error = exc
+
+        # Back off only when another attempt follows; sleeping after the
+        # final failure would merely delay the error reaching the caller.
+        if attempt + 1 < retries:
+            time.sleep(backoff * (attempt + 1))
+
+    raise last_error or RuntimeError("OpenAI API call failed after retries.")
+
+
+def call_llm_with_retry(
+    provider: str,
+    api_key: str,
+    model: str,
+    system: str,
+    user_content: str,
+    max_tokens: int = 2000,
+) -> str:
+    """Send one system/user prompt to the chosen provider and return its text.
+
+    ``provider`` must be exactly "Anthropic" or "OpenAI"; any other value
+    raises ValueError. A new SDK client is created from ``api_key`` and the
+    request is delegated to the provider-specific retry helper using that
+    helper's default retry settings.
+    """
+    if provider not in ("Anthropic", "OpenAI"):
+        raise ValueError(f"Unsupported LLM provider: {provider}")
+
+    # Arguments shared by both provider-specific helpers.
+    request = dict(
+        model=model,
+        system=system,
+        user_content=user_content,
+        max_tokens=max_tokens,
+    )
+
+    if provider == "Anthropic":
+        import anthropic
+
+        return call_anthropic_with_retry(
+            client=anthropic.Anthropic(api_key=api_key), **request
+        )
+
+    from openai import OpenAI
+
+    return call_openai_with_retry(client=OpenAI(api_key=api_key), **request)
+
+
+def parse_llm_json(raw: str) -> dict[str, Any]:
+    """Decode the JSON payload of an LLM reply.
+
+    Markdown code-fence markers are removed first. If the remaining text
+    does not parse as JSON, the span from the first "{" to the last "}" is
+    tried instead; when no such span exists the original JSONDecodeError
+    is re-raised.
+    """
+    text = re.sub(r"^```[^\n]*\n?", "", raw.strip(), flags=re.MULTILINE)
+    text = re.sub(r"```$", "", text, flags=re.MULTILINE).strip()
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        first_brace = text.find("{")
+        last_brace = text.rfind("}")
+        if first_brace < 0 or last_brace <= first_brace:
+            raise
+        return json.loads(text[first_brace : last_brace + 1])