From 4762d8c838eb3b4beb104051e4b4b654aefea8ac Mon Sep 17 00:00:00 2001 From: asukhodko <24243464+asukhodko@users.noreply.github.com> Date: Thu, 8 Jan 2026 23:26:27 +0300 Subject: [PATCH] Improve README and documentation --- README.md | 169 ++++++++++++++--------------- docs/config.md | 42 +++++--- docs/debug_mode.md | 197 +++++++++++++++++----------------- docs/integrations/dify.md | 39 ++++--- docs/integrations/n8n.md | 26 ++--- docs/integrations/windmill.md | 22 ++-- docs/overview.md | 41 +++++++ docs/quickstart.md | 42 ++++++-- docs/renderers.md | 16 +-- docs/strategies.md | 22 ++-- 10 files changed, 350 insertions(+), 266 deletions(-) create mode 100644 docs/overview.md diff --git a/README.md b/README.md index 73f03bf..029455e 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,25 @@ # Chunkana -Intelligent Markdown chunking library for RAG systems. +[![GitHub Repository](https://img.shields.io/badge/GitHub-Chunkana-181717?logo=github)](https://github.com/asukhodko/chunkana) +[![PyPI version](https://img.shields.io/pypi/v/chunkana.svg)](https://pypi.org/project/chunkana/) +[![Python versions](https://img.shields.io/pypi/pyversions/chunkana.svg)](https://pypi.org/project/chunkana/) +[![License](https://img.shields.io/pypi/l/chunkana.svg)](LICENSE) +[![Downloads](https://img.shields.io/pypi/dm/chunkana.svg)](https://pypi.org/project/chunkana/) -## Features +**Chunkana** is a high-precision Markdown chunking library for RAG pipelines, search indexing, and LLM ingestion. It produces semantically correct Markdown chunks by respecting headers, code blocks, tables, and LaTeX while keeping the output retrieval-ready. 
-- 🧠 **Smart chunking**: Automatically selects optimal strategy based on content -- 📦 **Atomic blocks**: Preserves code blocks, tables, and LaTeX formulas -- 🌳 **Hierarchical**: Navigate chunks by header structure with tree invariant validation -- 📊 **Rich metadata**: Header paths, content types, overlap context -- 🔄 **Streaming**: Process large files (>10MB) efficiently -- 🎯 **Multiple renderers**: JSON, inline metadata, Dify-compatible -- ✅ **Quality assurance**: Automatic dangling header prevention and micro-chunk minimization +If you're looking for a **semantic Markdown chunker**, **Markdown splitter**, or **Markdown document segmenter** that preserves structure for LLM context windows, Chunkana is built for exactly that. + +## Why Chunkana + +Chunkana turns messy Markdown into clean, structured chunks that retain meaning: + +- **Semantic correctness**: preserves headers, lists, tables, code blocks, and math without splitting them mid-block. +- **RAG-ready metadata**: header paths, content types, line ranges, and overlap context. +- **Smart strategy selection**: automatically adapts to code-heavy, list-heavy, or structural documents. +- **Hierarchical navigation**: build a chunk tree for section-aware retrieval. +- **Streaming for large files**: chunk multi-megabyte documents without loading everything into memory. +- **Compatibility**: output formats for Dify and JSON APIs. ## Installation @@ -18,7 +27,13 @@ Intelligent Markdown chunking library for RAG systems. 
pip install chunkana ``` -## Quick Start +Optional extras: + +```bash +pip install "chunkana[docs]" +``` + +## Quick start ```python from chunkana import chunk_markdown @@ -42,10 +57,12 @@ def hello(): chunks = chunk_markdown(text) for chunk in chunks: - print(f"Lines {chunk.start_line}-{chunk.end_line}: {chunk.metadata['header_path']}") + print(f"{chunk.start_line}-{chunk.end_line}: {chunk.metadata['header_path']}") ``` -## Configuration +## Usage examples + +### 1) Tune chunk sizes and overlap ```python from chunkana import chunk_markdown, ChunkerConfig @@ -59,113 +76,83 @@ config = ChunkerConfig( chunks = chunk_markdown(text, config) ``` -### Hierarchical Chunking Configuration - -For hierarchical chunking with tree structure validation: +### 2) Build a hierarchical chunk tree ```python from chunkana import MarkdownChunker, ChunkConfig -config = ChunkConfig( - max_chunk_size=1000, - min_chunk_size=100, - overlap_size=100, - validate_invariants=True, # Enable tree invariant validation (default: True) - strict_mode=False, # Auto-fix violations vs raise exceptions (default: False) -) - -chunker = MarkdownChunker(config) +chunker = MarkdownChunker(ChunkConfig(validate_invariants=True)) result = chunker.chunk_hierarchical(text) -# Navigate the hierarchy root = result.get_chunk(result.root_id) children = result.get_children(result.root_id) -flat_chunks = result.get_flat_chunks() +flat_chunks = result.get_flat_chunks() # leaf + significant parent chunks ``` -**Configuration options:** -- `validate_invariants` (default: `True`): Validates tree invariants after construction -- `strict_mode` (default: `False`): When `True`, raises exceptions on invariant violations; when `False`, auto-fixes issues and logs warnings - -## Exception Handling - -Chunkana provides a hierarchy of exceptions for error handling: +### 3) Stream large Markdown files ```python -from chunkana import ( - ChunkanaError, # Base exception for all chunkana errors - HierarchicalInvariantError, # Tree 
structure violations - ValidationError, # Validation failures - ConfigurationError, # Invalid configuration - TreeConstructionError, # Tree building failures -) +from chunkana import MarkdownChunker -try: - result = chunker.chunk_hierarchical(text) -except HierarchicalInvariantError as e: - print(f"Invariant violation: {e.invariant}") - print(f"Chunk ID: {e.chunk_id}") - print(f"Suggested fix: {e.suggested_fix}") -except ChunkanaError as e: - print(f"Chunking error: {e}") +chunker = MarkdownChunker() +for chunk in chunker.chunk_file_streaming("docs/handbook.md"): + print(chunk.metadata["chunk_index"], chunk.size) ``` -## Renderers +### 4) Emit Dify-compatible output ```python from chunkana import chunk_markdown -from chunkana.renderers import render_dify_style, render_json +from chunkana.renderers import render_dify_style chunks = chunk_markdown(text) - -# JSON output -json_output = render_json(chunks) - -# Dify-compatible format -dify_output = render_dify_style(chunks) +output = render_dify_style(chunks) ``` -## Quality Features - -### Dangling Header Prevention - -Chunkana automatically prevents headers from being separated from their content. When a chunk would end with a header (like `#### Details`), the header is moved to the next chunk to maintain semantic coherence. - -### Micro-Chunk Minimization - -Small chunks are intelligently merged with adjacent content when they lack structural significance, reducing fragmentation while preserving important standalone elements like code blocks and tables. 
+### 5) Adaptive chunk sizing for mixed documents -### Tree Invariant Validation - -Hierarchical chunking validates: -- **is_leaf consistency**: Leaf status matches children presence -- **Parent-child bidirectionality**: All relationships are symmetric -- **No orphaned chunks**: Every chunk is reachable from root - -### Line Range Contract (Hierarchical Mode) +```python +from chunkana import chunk_markdown, ChunkerConfig +from chunkana.adaptive_sizing import AdaptiveSizeConfig -In hierarchical chunking mode, `start_line` and `end_line` follow a specific contract: +config = ChunkerConfig( + use_adaptive_sizing=True, + adaptive_config=AdaptiveSizeConfig( + base_size=1500, + code_weight=0.4, + min_size=500, + max_size=8000, + ), +) -- **Leaf nodes**: Line range covers only the chunk's own content -- **Internal nodes**: Line range covers only the node's own content (not children) -- **Root node**: Line range covers the entire document (1 to last line) +chunks = chunk_markdown(text, config) +``` -**Important**: The sum of children's line ranges does NOT equal the parent's range. The parent contains only its "header" content, while children contain detailed content. This is by design for hierarchical navigation. +## Renderers ```python -result = chunker.chunk_hierarchical(text) -root = result.get_chunk(result.root_id) +from chunkana.renderers import ( + render_dify_style, + render_json, + render_inline_metadata, + render_with_embedded_overlap, +) +``` -# Root covers entire document -print(f"Root: lines {root.start_line}-{root.end_line}") +- **render_dify_style** — `<metadata>` blocks for Dify. +- **render_json** — list of dictionaries for JSON APIs. +- **render_inline_metadata** — HTML comment metadata inline. +- **render_with_embedded_overlap** — injects overlap into text for RAG windows. 
-# Children cover their own sections -for child in result.get_children(result.root_id): - print(f"Child: lines {child.start_line}-{child.end_line}") -``` +## Integrations + +- [Dify](docs/integrations/dify.md) +- [n8n](docs/integrations/n8n.md) +- [Windmill](docs/integrations/windmill.md) ## Documentation +- [Overview](docs/overview.md) - [Quick Start](docs/quickstart.md) - [Configuration](docs/config.md) - [Strategies](docs/strategies.md) @@ -173,6 +160,16 @@ for child in result.get_children(result.root_id): - [Debug Mode](docs/debug_mode.md) - [Migration Guide](MIGRATION_GUIDE.md) +## FAQ + +**Q: What makes Chunkana different from a basic Markdown splitter?** + +Chunkana is a **semantic Markdown chunker** that keeps structure intact (headers, lists, code blocks, tables, LaTeX) and enriches each chunk with retrieval metadata. This yields more accurate search and RAG results than naive line-based splitting. + +**Q: Does Chunkana work for RAG and LLM ingestion?** + +Yes. Chunkana is optimized for **RAG chunking**, **LLM context window preparation**, and **semantic Markdown segmentation**. It provides overlap metadata and consistent hierarchy paths for retrieval pipelines. + ## License MIT diff --git a/docs/config.md b/docs/config.md index 645693a..bd47401 100644 --- a/docs/config.md +++ b/docs/config.md @@ -2,7 +2,7 @@ Chunkana uses `ChunkerConfig` (alias: `ChunkConfig`) to control chunking behavior. 
-## Basic Parameters +## Basic parameters | Parameter | Type | Default | Description | |-----------|------|---------|-------------| @@ -12,7 +12,7 @@ Chunkana uses `ChunkerConfig` (alias: `ChunkConfig`) to control chunking behavio | `preserve_atomic_blocks` | bool | True | Keep code blocks, tables, LaTeX intact | | `extract_preamble` | bool | True | Extract content before first header as preamble | -## Strategy Selection Thresholds +## Strategy selection thresholds | Parameter | Type | Default | Description | |-----------|------|---------|-------------| @@ -20,9 +20,9 @@ Chunkana uses `ChunkerConfig` (alias: `ChunkConfig`) to control chunking behavio | `structure_threshold` | int | 3 | Minimum headers for Structural strategy | | `list_ratio_threshold` | float | 0.4 | List content ratio for ListAware strategy | | `list_count_threshold` | int | 5 | Minimum lists for ListAware strategy | -| `strategy_override` | str\|None | None | Force specific strategy: "code_aware", "list_aware", "structural", "fallback" | +| `strategy_override` | str\|None | None | Force strategy: "code_aware", "list_aware", "structural", "fallback" | -## Code-Context Binding +## Code-context binding These parameters control how code blocks are bound to surrounding explanations: @@ -35,7 +35,7 @@ These parameters control how code blocks are bound to surrounding explanations: | `bind_output_blocks` | bool | True | Bind code with its output blocks | | `preserve_before_after_pairs` | bool | True | Keep before/after code pairs together | -## Adaptive Sizing +## Adaptive sizing | Parameter | Type | Default | Description | |-----------|------|---------|-------------| @@ -55,7 +55,7 @@ adaptive_config = AdaptiveSizeConfig( ) ``` -## Table Grouping +## Table grouping | Parameter | Type | Default | Description | |-----------|------|---------|-------------| @@ -73,7 +73,7 @@ table_config = TableGroupingConfig( ) ``` -## Overlap Behavior +## Overlap behavior | Parameter | Type | Default | Description | 
|-----------|------|---------|-------------| @@ -82,7 +82,7 @@ table_config = TableGroupingConfig( The overlap is stored in metadata (`previous_content`, `next_content`), not embedded in `chunk.content`. -## LaTeX Handling +## LaTeX handling | Parameter | Type | Default | Description | |-----------|------|---------|-------------| @@ -90,13 +90,13 @@ The overlap is stored in metadata (`previous_content`, `next_content`), not embe When enabled, LaTeX blocks (`$$...$$`, `\[...\]`, `\begin{...}...\end{...}`) are treated as atomic units. -## Computed Fields +## Computed fields | Field | Description | |-------|-------------| | `enable_overlap` | Computed as `overlap_size > 0` | -## Factory Methods +## Factory methods ```python from chunkana import ChunkerConfig @@ -120,9 +120,19 @@ config = ChunkerConfig.from_dict(config_dict) Round-trip is guaranteed: `ChunkerConfig.from_dict(config.to_dict()) == config` -## Example Configurations +## Recommended presets -### Documentation Sites +### RAG pipelines + +```python +config = ChunkerConfig( + max_chunk_size=4096, + min_chunk_size=512, + overlap_size=200, +) +``` + +### Documentation sites ```python config = ChunkerConfig( @@ -133,7 +143,7 @@ config = ChunkerConfig( ) ``` -### Code Repositories +### Code repositories ```python config = ChunkerConfig( @@ -145,7 +155,7 @@ config = ChunkerConfig( ) ``` -### Changelogs / Release Notes +### Changelogs / release notes ```python config = ChunkerConfig( @@ -156,7 +166,7 @@ config = ChunkerConfig( ) ``` -### Scientific Documents (LaTeX) +### Scientific documents (LaTeX) ```python config = ChunkerConfig( @@ -166,6 +176,6 @@ config = ChunkerConfig( ) ``` -## Plugin Compatibility +## Plugin compatibility All 17 fields from dify-markdown-chunker's `ChunkConfig` are supported. See [Parity Matrix](migration/parity_matrix.md) for details. 
diff --git a/docs/debug_mode.md b/docs/debug_mode.md index 19102c3..39d41c3 100644 --- a/docs/debug_mode.md +++ b/docs/debug_mode.md @@ -1,98 +1,99 @@ -# Debug Mode Documentation - -## Overview - -This document describes the metadata behavior in chunkana across different chunking modes. - -## Metadata Behavior - -### Standard Chunking Mode - -When using `chunker.chunk(document)`, chunks contain: - -- `content_type`: Type of content (section, preamble, etc.) -- `header_path`: Hierarchical path to the chunk -- `header_level`: Level of the header (1-6) -- `chunk_index`: Position in the chunk list -- `strategy`: Chunking strategy used - -### Hierarchical Chunking Mode - -When using `chunker.chunk_hierarchical(document)`, chunks contain additional metadata: - -- All fields from standard mode -- `chunk_id`: Unique identifier for the chunk -- `parent_id`: ID of parent chunk -- `children_ids`: IDs of child chunks -- `is_leaf`: Whether chunk is a leaf node (no children) -- `is_root`: Whether chunk is the root node -- `hierarchy_level`: Depth in the tree (0=root, 1=sections, etc.) -- `prev_sibling_id`: ID of previous sibling (if any) -- `next_sibling_id`: ID of next sibling (if any) - -## Hierarchical Mode Specifics - -### get_flat_chunks() - -The `get_flat_chunks()` method returns chunks suitable for flat retrieval: - -1. **Leaf chunks** (no children) are always included -2. **Non-leaf chunks** with significant content (>100 chars excluding headers) are also included -3. **Root chunks** are excluded - -This ensures no content is lost when using flat retrieval mode. - -### Navigation Methods - -Navigation methods (`get_parent()`, `get_children()`, `get_siblings()`, `get_ancestors()`) work in hierarchical mode using internal chunk IDs. 
- -## Examples - -### Basic Usage - -```python -from chunkana import MarkdownChunker, ChunkConfig - -# Standard chunking -config = ChunkConfig(max_chunk_size=1000) -chunker = MarkdownChunker(config) -chunks = chunker.chunk(document) -``` - -### Hierarchical Mode - -```python -# Hierarchical chunking -config = ChunkConfig(max_chunk_size=1000) -chunker = MarkdownChunker(config) -result = chunker.chunk_hierarchical(document) - -# Access all chunks -all_chunks = result.chunks - -# Access only leaf chunks (for flat retrieval) -flat_chunks = result.get_flat_chunks() - -# Navigate hierarchy -root = result.get_chunk(result.root_id) -children = result.get_children(result.root_id) -``` - -## Invariant Validation - -Enable invariant validation to catch tree structure issues: - -```python -config = ChunkConfig( - max_chunk_size=1000, - min_chunk_size=100, - validate_invariants=True, # Enable invariant validation - strict_mode=False # Log warnings instead of raising exceptions -) -``` - -### Validated Invariants - -1. **is_leaf consistency**: `is_leaf` equals `(children_ids is empty)` -2. **Parent-child bidirectionality**: Parent-child relationships are mutual -3. **Content range consistency**: Root chunks have consistent content ranges +# Debug Mode Documentation + +## Overview + +This document describes the metadata behavior in Chunkana across different chunking modes. + +## Metadata behavior + +### Standard chunking mode + +When using `chunker.chunk(document)`, chunks contain: + +- `content_type`: Type of content (section, preamble, etc.) 
+- `header_path`: Hierarchical path to the chunk +- `header_level`: Level of the header (1-6) +- `chunk_index`: Position in the chunk list +- `strategy`: Chunking strategy used +- `previous_content` / `next_content`: Overlap content in metadata (if overlap enabled) + +### Hierarchical chunking mode + +When using `chunker.chunk_hierarchical(document)`, chunks contain additional metadata: + +- All fields from standard mode +- `chunk_id`: Unique identifier for the chunk +- `parent_id`: ID of parent chunk +- `children_ids`: IDs of child chunks +- `is_leaf`: Whether chunk is a leaf node (no children) +- `is_root`: Whether chunk is the root node +- `hierarchy_level`: Depth in the tree (0=root, 1=sections, etc.) +- `prev_sibling_id`: ID of previous sibling (if any) +- `next_sibling_id`: ID of next sibling (if any) + +## Hierarchical mode specifics + +### get_flat_chunks() + +The `get_flat_chunks()` method returns chunks suitable for flat retrieval: + +1. **Leaf chunks** (no children) are always included +2. **Non-leaf chunks** with significant content (>100 chars excluding headers) are also included +3. **Root chunks** are excluded + +This ensures no content is lost when using flat retrieval mode. + +### Navigation methods + +Navigation methods (`get_parent()`, `get_children()`, `get_siblings()`, `get_ancestors()`) work in hierarchical mode using internal chunk IDs. 
+ +## Examples + +### Basic usage + +```python +from chunkana import MarkdownChunker, ChunkConfig + +# Standard chunking +config = ChunkConfig(max_chunk_size=1000) +chunker = MarkdownChunker(config) +chunks = chunker.chunk(document) +``` + +### Hierarchical mode + +```python +# Hierarchical chunking +config = ChunkConfig(max_chunk_size=1000) +chunker = MarkdownChunker(config) +result = chunker.chunk_hierarchical(document) + +# Access all chunks +all_chunks = result.chunks + +# Leaf + significant parent chunks (for flat retrieval) +flat_chunks = result.get_flat_chunks() + +# Navigate hierarchy +root = result.get_chunk(result.root_id) +children = result.get_children(result.root_id) +``` + +## Invariant validation + +Enable invariant validation to catch tree structure issues: + +```python +config = ChunkConfig( + max_chunk_size=1000, + min_chunk_size=100, + validate_invariants=True, # Enable invariant validation + strict_mode=False # Log warnings instead of raising exceptions +) +``` + +### Validated invariants + +1. **is_leaf consistency**: `is_leaf` equals `(children_ids is empty)` +2. **Parent-child bidirectionality**: Parent-child relationships are mutual +3. **Content range consistency**: Root chunks have consistent content ranges diff --git a/docs/integrations/dify.md b/docs/integrations/dify.md index 9dd4fc3..58a54e9 100644 --- a/docs/integrations/dify.md +++ b/docs/integrations/dify.md @@ -2,7 +2,7 @@ Using Chunkana with Dify workflows. 
-## Quick Migration +## Quick migration ```python # Before (dify-markdown-chunker plugin) @@ -11,13 +11,14 @@ result = chunker.chunk(text, include_metadata=True) # After (Chunkana) from chunkana import chunk_markdown from chunkana.renderers import render_dify_style + chunks = chunk_markdown(text) result = render_dify_style(chunks) ``` -## Parameter Mapping +## Parameter mapping -### Tool Input Parameters +### Tool input parameters | Dify Tool Param | Chunkana Equivalent | |-----------------|---------------------| @@ -29,11 +30,11 @@ result = render_dify_style(chunks) | `include_metadata=False` | `render_with_embedded_overlap(chunks)` | | `enable_hierarchy=True` | `chunk_hierarchical(text, config)` | -### Config Fields +### Config fields All 17 plugin config fields are supported. See [Parity Matrix](../migration/parity_matrix.md). -## Basic Usage +## Basic usage ```python from chunkana import chunk_markdown, ChunkerConfig @@ -46,16 +47,16 @@ def process_document(text: str, include_metadata: bool = True) -> list[str]: min_chunk_size=512, overlap_size=200, ) - + chunks = chunk_markdown(text, config) - + if include_metadata: return render_dify_style(chunks) else: return render_with_embedded_overlap(chunks) ``` -## Metadata Format +## Metadata format With `render_dify_style()`, each chunk includes: @@ -67,7 +68,7 @@ With `render_dify_style()`, each chunk includes: Actual chunk content here... 
``` -## Workflow Example +## Workflow example ```python # In Dify Code node @@ -77,42 +78,40 @@ from chunkana.renderers import render_dify_style def main(text: str) -> dict: chunks = chunk_markdown(text) formatted = render_dify_style(chunks) - + return { "chunks": formatted, "count": len(formatted), } ``` -## Hierarchical Chunking +## Hierarchical chunking ```python from chunkana import chunk_hierarchical +from chunkana.renderers import render_dify_style def main(text: str, debug: bool = False) -> dict: result = chunk_hierarchical(text) - + if debug: # Include all chunks (root, intermediate, leaf) chunks = result.get_all_chunks() else: # Only leaf chunks (default) chunks = result.get_flat_chunks() - + return {"chunks": render_dify_style(chunks)} ``` -## Common Pitfalls - -1. **Return type changed**: Plugin could return `List[str]` or `List[Chunk]`. Chunkana always returns `List[Chunk]` — use renderers for strings. +## Common pitfalls +1. **Return type changed**: The plugin could return `List[str]` or `List[Chunk]`. Chunkana always returns `List[Chunk]` — use renderers for strings. 2. **include_metadata is not a parameter**: Use renderer selection instead. - 3. **strategy="auto"**: In Chunkana, use `strategy_override=None` (default). - 4. **chunk_overlap vs overlap_size**: Plugin tool uses `chunk_overlap`, config uses `overlap_size`. -## Migration Verification +## Migration verification ```bash # Run baseline tests to verify parity @@ -120,6 +119,6 @@ pytest tests/baseline/test_canonical.py -v pytest tests/baseline/test_view_level.py -v ``` -## Full Migration Guide +## Full migration guide See [MIGRATION_GUIDE.md](../../MIGRATION_GUIDE.md) for detailed migration instructions. 
diff --git a/docs/integrations/n8n.md b/docs/integrations/n8n.md index 08ba6d4..013c706 100644 --- a/docs/integrations/n8n.md +++ b/docs/integrations/n8n.md @@ -10,7 +10,7 @@ Install Chunkana in your n8n Python environment: pip install chunkana ``` -## Code Node Example +## Code node example ```python from chunkana import chunk_markdown @@ -18,20 +18,20 @@ from chunkana.renderers import render_json def process(items): results = [] - + for item in items: text = item.get("text", "") chunks = chunk_markdown(text) - + results.append({ "chunks": render_json(chunks), "chunk_count": len(chunks), }) - + return results ``` -## With Configuration +## With configuration ```python from chunkana import chunk_markdown, ChunkerConfig @@ -44,19 +44,19 @@ config = ChunkerConfig( def process(items): results = [] - + for item in items: text = item.get("text", "") chunks = chunk_markdown(text, config) - + results.append({ "chunks": render_json(chunks), }) - + return results ``` -## Output Format +## Output format Each chunk in `render_json()` output: @@ -76,7 +76,7 @@ Each chunk in `render_json()` output: } ``` -## Streaming Large Documents +## Streaming large documents For large documents, use streaming: @@ -87,14 +87,14 @@ chunker = MarkdownChunker() def process_large(items): results = [] - + for item in items: file_path = item.get("file_path") chunks = list(chunker.chunk_file_streaming(file_path)) - + results.append({ "chunks": [c.to_dict() for c in chunks], }) - + return results ``` diff --git a/docs/integrations/windmill.md b/docs/integrations/windmill.md index 1fb35d2..881395c 100644 --- a/docs/integrations/windmill.md +++ b/docs/integrations/windmill.md @@ -11,7 +11,7 @@ Add `chunkana` to your script dependencies: #chunkana ``` -## Basic Script +## Basic script ```python #requirements: @@ -22,14 +22,14 @@ from chunkana.renderers import render_json def main(text: str) -> dict: chunks = chunk_markdown(text) - + return { "chunks": render_json(chunks), "count": len(chunks), } ``` 
-## With Configuration +## With configuration ```python #requirements: @@ -47,16 +47,16 @@ def main( max_chunk_size=max_chunk_size, overlap_size=overlap_size, ) - + chunks = chunk_markdown(text, config) - + return { "chunks": render_json(chunks), "count": len(chunks), } ``` -## Processing Files +## Processing files ```python #requirements: @@ -68,7 +68,7 @@ from chunkana.renderers import render_json def main(file_content: str) -> dict: chunker = MarkdownChunker() chunks = chunker.chunk(file_content) - + return { "chunks": render_json(chunks), "metadata": { @@ -78,7 +78,7 @@ def main(file_content: str) -> dict: } ``` -## Hierarchical Output +## Hierarchical output ```python #requirements: @@ -89,7 +89,7 @@ from chunkana import MarkdownChunker def main(text: str) -> dict: chunker = MarkdownChunker() result = chunker.chunk_hierarchical(text) - + return { "tree": result.to_tree_dict(), "flat_chunks": [c.to_dict() for c in result.get_flat_chunks()], @@ -97,7 +97,7 @@ def main(text: str) -> dict: } ``` -## Error Handling +## Error handling ```python #requirements: @@ -108,7 +108,7 @@ from chunkana import chunk_markdown def main(text: str) -> dict: if not text or not text.strip(): return {"chunks": [], "error": None} - + try: chunks = chunk_markdown(text) return { diff --git a/docs/overview.md b/docs/overview.md new file mode 100644 index 0000000..785c4cf --- /dev/null +++ b/docs/overview.md @@ -0,0 +1,41 @@ +# Overview + +Chunkana is a semantic Markdown chunking library designed for RAG pipelines, search indexing, and LLM ingestion. It splits Markdown into retrieval-ready chunks while preserving structure and context. + +## What Chunkana solves + +Traditional splitters break Markdown structure and cause semantic drift. Chunkana avoids that by: + +- Preserving **headers, lists, tables, code blocks, and LaTeX** as atomic units. +- Recording **header paths** and **content types** for structured retrieval. +- Providing **overlap metadata** for sliding window context. 
+- Supporting **hierarchical chunk trees** for section-aware navigation. + +## Typical use cases + +- **RAG pipelines**: get semantically coherent chunks and metadata for vector search. +- **LLM ingestion**: keep code examples and tables intact inside context windows. +- **Search indexing**: store header paths and types for faceted search. +- **Docs migration**: replace naive Markdown splitting with structural chunking. + +## How it works (high level) + +1. Chunkana analyzes the document structure and content ratios. +2. It selects an optimal strategy (code-aware, list-aware, structural, or fallback). +3. It builds chunks that respect Markdown syntax and your size constraints. +4. It emits chunks with metadata to support downstream retrieval. + +## Core concepts + +- **Chunk**: a unit of text with metadata like `header_path`, `content_type`, and line range. +- **Strategy**: the chunking algorithm selected per document. +- **Hierarchy**: a tree of chunks that mirrors header structure. +- **Renderers**: output formatters for Dify or JSON APIs. + +## Next steps + +- [Quick Start](quickstart.md) +- [Configuration](config.md) +- [Strategies](strategies.md) +- [Renderers](renderers.md) +- [Integrations](integrations/dify.md) diff --git a/docs/quickstart.md b/docs/quickstart.md index d6ce770..6c1467f 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -8,7 +8,7 @@ Get started with Chunkana in under a minute. 
pip install chunkana ``` -## Basic Usage +## Basic usage ```python from chunkana import chunk_markdown @@ -33,7 +33,7 @@ for chunk in chunks: print(f"Lines {chunk.start_line}-{chunk.end_line}: {chunk.content[:50]}...") ``` -## With Custom Configuration +## With custom configuration ```python from chunkana import chunk_markdown, ChunkerConfig @@ -47,7 +47,33 @@ config = ChunkerConfig( chunks = chunk_markdown(text, config) ``` -## Rendering Output +## Hierarchical chunking + +```python +from chunkana import MarkdownChunker, ChunkConfig + +chunker = MarkdownChunker(ChunkConfig(validate_invariants=True)) +result = chunker.chunk_hierarchical(text) + +# Leaf + significant parent chunks +flat_chunks = result.get_flat_chunks() + +# Navigate the hierarchy +root = result.get_chunk(result.root_id) +children = result.get_children(result.root_id) +``` + +## Streaming large documents + +```python +from chunkana import MarkdownChunker + +chunker = MarkdownChunker() +for chunk in chunker.chunk_file_streaming("docs/handbook.md"): + print(chunk.metadata["chunk_index"], chunk.size) +``` + +## Rendering output ```python from chunkana import chunk_markdown @@ -62,8 +88,10 @@ json_output = render_json(chunks) dify_output = render_dify_style(chunks) ``` -## Next Steps +## Next steps -- [Configuration Guide](config.md) — all configuration options -- [Strategies](strategies.md) — how chunking strategies work -- [Renderers](renderers.md) — output formatting options +- [Overview](overview.md) +- [Configuration Guide](config.md) +- [Strategies](strategies.md) +- [Renderers](renderers.md) +- [Integrations](integrations/dify.md) diff --git a/docs/renderers.md b/docs/renderers.md index e33cf43..bf78233 100644 --- a/docs/renderers.md +++ b/docs/renderers.md @@ -1,8 +1,8 @@ # Renderers -Renderers format chunk output without modifying the original chunks. +Renderers format chunk output without modifying the original chunks. 
Use them to emit Dify-compatible strings, JSON, or inline metadata for debugging. -## Available Renderers +## Available renderers ### render_dify_style @@ -77,7 +77,7 @@ output = render_inline_metadata(chunks) Keys are sorted alphabetically for deterministic output. -## Renderer Selection Guide +## Renderer selection guide | Use Case | Renderer | |----------|----------| @@ -88,7 +88,7 @@ Keys are sorted alphabetically for deterministic output. | RAG with sliding window | `render_with_prev_overlap` | | Debugging / inspection | `render_inline_metadata` | -## Decision Tree +## Decision tree ``` Need output for Dify plugin? @@ -101,7 +101,7 @@ Need output for Dify plugin? └── Need inline metadata → render_inline_metadata() ``` -## Important Notes +## Important notes 1. **Renderers don't modify chunks** — they only format output 2. **Overlap is in metadata** — `chunk.content` is always canonical (no embedded overlap) @@ -109,7 +109,7 @@ Need output for Dify plugin? 4. **Empty overlap handled** — missing `previous_content`/`next_content` is fine 5. **Deterministic** — same input always produces same output -## Custom Rendering +## Custom rendering For custom formats, access chunk data directly: @@ -121,11 +121,11 @@ for chunk in chunks: prev = chunk.metadata.get("previous_content", "") next_ = chunk.metadata.get("next_content", "") chunk_id = chunk.metadata.get("chunk_id", "") - + # Your custom formatting here ``` -## Plugin Compatibility +## Plugin compatibility | Plugin Parameter | Chunkana Renderer | |------------------|-------------------| diff --git a/docs/strategies.md b/docs/strategies.md index 7f68817..b29bb42 100644 --- a/docs/strategies.md +++ b/docs/strategies.md @@ -2,14 +2,14 @@ Chunkana automatically selects the best strategy based on document content analysis. -## Strategy Selection Order +## Strategy selection order 1. **CodeAware** (priority 1) — documents with code blocks or tables 2. **ListAware** (priority 2) — list-heavy documents 3. 
**Structural** (priority 3) — documents with hierarchical headers 4. **Fallback** (priority 4) — universal fallback -## CodeAware Strategy +## CodeAware strategy Selected when: - `code_block_count >= 1`, OR @@ -23,7 +23,9 @@ Features: - Binds code blocks to surrounding explanations (if enabled) - Groups related code blocks (before/after pairs, code/output) -## ListAware Strategy +Best for: technical docs, API guides, tutorials, Markdown with fenced code. + +## ListAware strategy Selected when (for non-structural documents): - `list_ratio > list_ratio_threshold` (default 0.4), OR @@ -37,7 +39,9 @@ Features: - Groups related list items - Handles checkbox lists with stats -## Structural Strategy +Best for: checklists, release notes, handbooks, policy docs. + +## Structural strategy Selected when: - `header_count >= structure_threshold` (default 3), AND @@ -49,7 +53,9 @@ Features: - Handles preamble (content before first header) - Preserves atomic blocks within sections -## Fallback Strategy +Best for: Markdown with clear H1/H2/H3 structure (docs, READMEs). + +## Fallback strategy Always available as last resort. @@ -58,7 +64,9 @@ Features: - Groups paragraphs to fit `max_chunk_size` - Preserves atomic blocks if present -## Forcing a Strategy +Best for: unstructured or minimal Markdown. + +## Forcing a strategy ```python from chunkana import chunk_markdown, ChunkerConfig @@ -70,7 +78,7 @@ chunks = chunk_markdown(text, config) Valid values: `"code_aware"`, `"list_aware"`, `"structural"`, `"fallback"` -## Strategy in Metadata +## Strategy in metadata Each chunk includes the strategy used: