eugen-goebel · dependabot · Mar 26, 2026 · Mar 26, 2026 · Mar 29, 2026 · Mar 29, 2026
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,12 @@
+__pycache__/
+*.py[cod]
+.pytest_cache/
+.env
+venv/
+.venv/
+.git/
+.github/
+output/
+*.db
+.DS_Store
+
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,34 @@
+---
+name: Bug report
+about: Report something that is not working as expected
+title: '[BUG] '
+labels: bug
+---
+
+## Description
+
+A clear and concise description of the bug.
+
+## Steps to Reproduce
+
+1.
+2.
+3.
+
+## Expected Behavior
+
+What you expected to happen.
+
+## Actual Behavior
+
+What actually happened. Include the full error message or stack trace if applicable.
+
+## Environment
+
+- OS:
+- Python version:
+- Project commit / version:
+
+## Additional Context
+
+Any other context, screenshots, or sample inputs.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,22 @@
+---
+name: Feature request
+about: Suggest an idea or improvement
+title: '[FEATURE] '
+labels: enhancement
+---
+
+## Problem
+
+What problem does this solve? Who would benefit?
+
+## Proposed Solution
+
+How could it work?
+
+## Alternatives Considered
+
+Any other approaches you considered.
+
+## Additional Context
+
+Mockups, references, or related issues.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,17 @@
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    open-pull-requests-limit: 5
+    labels:
+      - "dependencies"
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
+    labels:
+      - "dependencies"
+      - "ci"
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -0,0 +1,23 @@
+## Summary
+
+<!-- Brief description of what this PR does and why. -->
+
+## Changes
+
+<!-- Bullet list of changes. -->
+-
+-
+
+## Testing
+
+- [ ] All existing tests pass (`pytest -v`)
+- [ ] Added or updated tests for new behavior
+- [ ] Manually verified the Streamlit app
+
+## Screenshots (if UI changes)
+
+<!-- Drag images here if relevant. -->
+
+## Related Issues
+
+<!-- Closes #N -->
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,30 @@
+name: Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run tests
+        run: pytest -v
diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,7 @@ venv/
 
 # ChromaDB local storage
 vectorstore/chroma_data/
+.chroma_data/
 
 # IDE
 .vscode/
@@ -24,7 +25,7 @@ vectorstore/chroma_data/
 Thumbs.db
 
 # Streamlit
-.streamlit/
+.streamlit/secrets.toml
 
 # Output
 output/
diff --git a/.streamlit/secrets.toml.example b/.streamlit/secrets.toml.example
@@ -0,0 +1,9 @@
+# Example secrets file for Streamlit Cloud deployment.
+#
+# Local development: copy .env.example to .env and set ANTHROPIC_API_KEY there.
+# Streamlit Cloud:   paste the contents below into the Secrets editor in the
+#                    app dashboard (https://share.streamlit.io → your app → Settings → Secrets).
+#
+# DO NOT COMMIT a real .streamlit/secrets.toml file — it is gitignored.
+
+ANTHROPIC_API_KEY = "sk-ant-..."
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,43 @@
+# Contributing
+
+Thanks for your interest! This is primarily a personal portfolio project, but contributions are welcome.
+
+## Getting Started
+
+1. Fork the repository and clone your fork.
+2. Create and activate a virtual environment:
+   ```bash
+   python3 -m venv venv
+   source venv/bin/activate
+   ```
+3. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+4. Copy `.env.example` to `.env` and add your `ANTHROPIC_API_KEY` (tests work without one — they mock the client).
+5. Run the test suite:
+   ```bash
+   pytest -v
+   ```
+6. Try the Streamlit app:
+   ```bash
+   streamlit run app.py
+   ```
+
+## Submitting Changes
+
+1. Create a feature branch from `main`:
+   ```bash
+   git checkout -b feature/your-feature
+   ```
+2. Make focused, well-described commits.
+3. Make sure the test suite passes locally before pushing.
+4. Open a pull request against `main` with a clear description of what you changed and why. Reference any related issues.
+
+## Code Style
+
+- Follow PEP 8 for Python code.
+- Add tests for any new behavior — agent tests should mock the Anthropic client.
+- Do not commit any vector store data (`.chroma_data/`) or sample documents you do not own.
+- Update the README if user-facing behavior changes.
+- Keep changes focused — one PR, one concern.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,12 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+EXPOSE 8501
+
+CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0", "--server.port", "8501"]
diff --git a/README.md b/README.md
@@ -2,10 +2,26 @@
 
 An AI-powered Retrieval-Augmented Generation (RAG) system that lets you chat with your documents. Upload PDF, DOCX, or TXT files and ask questions — answers are grounded in your document content with source references.
 
+![CI](https://github.com/eugen-goebel/smart-doc-qa/actions/workflows/tests.yml/badge.svg)
 ![Python](https://img.shields.io/badge/Python-3.10+-blue)
+![Tests](https://img.shields.io/badge/Tests-passed-brightgreen)
 ![Streamlit](https://img.shields.io/badge/Streamlit-1.40+-red)
+![ChromaDB](https://img.shields.io/badge/ChromaDB-0.5+-orange)
 ![License](https://img.shields.io/badge/License-MIT-green)
 
+> **Try it live:** _(Streamlit Cloud demo URL will be added once deployed — see the [Deployment](#deployment) section.)_
+
+## Screenshots
+
+**Demo Mode** — clean landing view; runs without an API key using raw retrieval results
+![Landing](docs/screenshots/01-landing.png)
+
+**Question Answered** — asking about 2025 revenue returns the most relevant chunk with source reference
+![Question Answered](docs/screenshots/02-question-answered.png)
+
+**Retrieved Chunks** — similarity search surfaces multiple ranked matches across the document
+![Retrieved Chunks](docs/screenshots/03-retrieved-chunks.png)
+
 ## How It Works
 
 ```
@@ -33,7 +49,7 @@ An AI-powered Retrieval-Augmented Generation (RAG) system that lets you chat wit
 
 ```bash
 # Clone and setup
-git clone https://github.com/YOUR_USERNAME/smart-doc-qa.git
+git clone https://github.com/eugen-goebel/smart-doc-qa.git
 cd smart-doc-qa
 python3 -m venv venv && source venv/bin/activate
 pip install -r requirements.txt
@@ -129,6 +145,31 @@ pytest tests/test_vectorstore.py -v
 
 All tests run without an API key. The QA agent tests use mocked API responses.
 
+## Deployment
+
+This app is designed to deploy in a few clicks on **Streamlit Community Cloud** (free tier).
+
+**Steps:**
+
+1. Sign in at [share.streamlit.io](https://share.streamlit.io) with your GitHub account.
+2. Click **New app** and pick this repository / branch / `app.py`.
+3. _(Optional)_ In **Advanced settings → Secrets**, paste:
+   ```toml
+   ANTHROPIC_API_KEY = "sk-ant-..."
+   ```
+   See [`.streamlit/secrets.toml.example`](.streamlit/secrets.toml.example).
+4. Click **Deploy**. The app builds in ~2 minutes.
+
+**API key handling:**
+
+The app reads the key from three places, in this order:
+
+1. `os.environ["ANTHROPIC_API_KEY"]` — set via `.env` for local runs
+2. `st.secrets["ANTHROPIC_API_KEY"]` — set in Streamlit Cloud dashboard
+3. Manual entry in the sidebar — fallback for end users
+
+If no key is provided, the app runs in **Demo Mode**: vector search still works, but the model-generated answer step is skipped and the raw retrieved chunks are shown instead.
+
 ## License
 
 MIT
diff --git a/SECURITY.md b/SECURITY.md
@@ -0,0 +1,23 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+If you discover a security vulnerability in this project, please report it privately by emailing **eugen-goebel@hotmail.de**.
+
+Please do not file public GitHub issues for security vulnerabilities, as this could expose users to risk before a fix is available.
+
+## Response Time
+
+I aim to acknowledge reports within 7 days and provide an initial assessment within 14 days.
+
+## Supported Versions
+
+This is a portfolio project; only the latest commit on `main` is supported.
+
+## API Key Handling
+
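+This project reads the Anthropic API key from the `ANTHROPIC_API_KEY` environment variable, from Streamlit secrets, or from manual entry in the app sidebar. Never commit your API key — use the provided `.env.example` and `.streamlit/secrets.toml.example` as templates and keep your real `.env` and `.streamlit/secrets.toml` files out of version control (both are gitignored).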
+
+## Document Privacy
+
+When using this RAG system, uploaded documents are processed on the machine that runs the app (your own computer for local runs, the hosting server for a deployed instance) and stored in that machine's local ChromaDB vector store. Only the chunks selected by retrieval are sent to the Claude API at query time. Do not upload sensitive documents to public deployments without reviewing the data flow.
diff --git a/agents/document_loader.py b/agents/document_loader.py
@@ -1,5 +1,5 @@
 """
-Document Loader — Reads PDF, DOCX, and TXT files and extracts plain text.
+Document Loader — Reads PDF, DOCX, TXT, and Markdown files and extracts plain text.
 
 Think of this as a "translator" that converts different file formats into
 a single format (plain text) that the rest of the pipeline can work with.
@@ -8,6 +8,7 @@
   - .pdf  → uses pypdf to extract text from each page
   - .docx → uses python-docx to read paragraphs
   - .txt  → reads the file directly
+  - .md   → reads the file directly (Markdown is already human-readable)
 """
 
 import os
@@ -32,7 +33,7 @@ class LoadedDocument(BaseModel):
         char_count: Total number of characters in the text
     """
     filename: str = Field(description="Original filename")
-    format: str = Field(description="File format: pdf, docx, or txt")
+    format: str = Field(description="File format: pdf, docx, txt, or md")
     text: str = Field(description="Full extracted text")
     page_count: int = Field(description="Number of pages (1 for txt/docx)")
     char_count: int = Field(description="Total characters in the text")
@@ -42,7 +43,7 @@ class LoadedDocument(BaseModel):
 # Supported file extensions
 # ---------------------------------------------------------------------------
 
-SUPPORTED_FORMATS = {".pdf", ".docx", ".txt"}
+SUPPORTED_FORMATS = {".pdf", ".docx", ".txt", ".md"}
 
 
 # ---------------------------------------------------------------------------
@@ -91,6 +92,8 @@ def load(self, filepath: str) -> LoadedDocument:
             text, page_count = self._read_pdf(filepath)
         elif ext == ".docx":
             text, page_count = self._read_docx(filepath)
+        elif ext == ".md":
+            text, page_count = self._read_md(filepath)
         else:
             text, page_count = self._read_txt(filepath)
 
@@ -135,3 +138,9 @@ def _read_txt(self, filepath: str) -> tuple[str, int]:
         with open(filepath, "r", encoding="utf-8") as f:
             text = f.read()
         return text, 1
+
+    def _read_md(self, filepath: str) -> tuple[str, int]:
+        """Read a Markdown file as UTF-8 text; returns (text, 1) like a .txt file."""
+        # Markdown needs no conversion, so reuse the plain-text reader instead
+        # of duplicating its file handling (keeps encoding behavior in one place).
+        return self._read_txt(filepath)
diff --git a/agents/vectorstore.py b/agents/vectorstore.py
@@ -29,6 +29,8 @@
 
 from .chunker import TextChunk
 
+DEFAULT_PERSIST_DIR = ".chroma_data"
+
 
 # ---------------------------------------------------------------------------
 # Data model
@@ -171,6 +173,14 @@ def search(self, query: str, top_k: int = 5) -> list[SearchResult]:
 
         return search_results
 
+    def list_sources(self) -> list[str]:
+        """Return the sorted, de-duplicated source filenames of all stored chunks."""
+        if self._collection.count() == 0:
+            return []
+        metadatas = self._collection.get(include=["metadatas"])["metadatas"] or []
+        # Chroma may yield None for records stored without metadata; skip those entries.
+        return sorted({m["source"] for m in metadatas if m and "source" in m})
+
     def reset(self):
         """Delete all stored chunks (start fresh)."""
         self._client.delete_collection(self._collection.name)