From e3c3f3637f1aaceca23e76ec36792d077601df63 Mon Sep 17 00:00:00 2001 From: "Bhavnick @ Chonkie" Date: Wed, 1 Jul 2026 13:47:16 -0700 Subject: [PATCH] Release 0.0.2: ship pulpie.markdown; migrate chonkie-ai/chonkie-inc references to feyninc --- README.md | 14 +++++++------- pyproject.toml | 10 +++++----- scripts/cc-filter/AGENT_PROMPT.md | 2 +- src/pulpie/__init__.py | 2 +- src/pulpie/model_utils.py | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index f53c94d..62cc3fe 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@
- - pulpie + + pulpie [![PyPI version](https://img.shields.io/pypi/v/pulpie.svg)](https://pypi.org/project/pulpie/) [![Python](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://pypi.org/project/pulpie/) -[![License](https://img.shields.io/github/license/chonkie-inc/pulpie.svg)](https://github.com/chonkie-inc/pulpie/blob/main/LICENSE) +[![License](https://img.shields.io/github/license/feyninc/pulpie.svg)](https://github.com/feyninc/pulpie/blob/main/LICENSE) [![Downloads](https://static.pepy.tech/badge/pulpie)](https://pepy.tech/project/pulpie) [![Blog](https://img.shields.io/badge/blog-read%20the%20writeup-E34C26.svg)](https://usefeyn.com/blog/pulpie-pareto-optimal-models-for-cleaning-the-web/) -[![GitHub stars](https://img.shields.io/github/stars/chonkie-inc/pulpie.svg)](https://github.com/chonkie-inc/pulpie/stargazers) +[![GitHub stars](https://img.shields.io/github/stars/feyninc/pulpie.svg)](https://github.com/feyninc/pulpie/stargazers) _Pareto-optimal models for cleaning the web. Extract main content from HTML at one twentieth the cost._ @@ -94,9 +94,9 @@ All three models are built on [EuroBERT](https://arxiv.org/abs/2503.05500), shar | Model | Hugging Face | Params | ROUGE-5 F1 | Notes | |-------|--------------|--------|------------|-------| -| **Orange Small** | [`chonkie-ai/pulpie-orange-small`](https://huggingface.co/chonkie-ai/pulpie-orange-small) | 210M | 0.862 | **Recommended**, best size-to-quality ratio | -| Orange Base | [`chonkie-ai/pulpie-orange-base`](https://huggingface.co/chonkie-ai/pulpie-orange-base) | 610M | 0.863 | Distilled from Large | -| Orange Large | [`chonkie-ai/pulpie-orange-large`](https://huggingface.co/chonkie-ai/pulpie-orange-large) | 2.1B | 0.873 | Teacher (highest quality) | +| **Orange Small** | [`feyninc/pulpie-orange-small`](https://huggingface.co/feyninc/pulpie-orange-small) | 210M | 0.862 | **Recommended**, best size-to-quality ratio | +| Orange Base | [`feyninc/pulpie-orange-base`](https://huggingface.co/feyninc/pulpie-orange-base) | 610M | 0.863 | Distilled from Large | +| Orange Large | [`feyninc/pulpie-orange-large`](https://huggingface.co/feyninc/pulpie-orange-large) | 2.1B | 0.873 | Teacher (highest quality) | `orange-small` is the default. Despite being a third the size of Dripper (the leading extractor), it matches its quality (0.862 vs 0.864) while running 20x faster. diff --git a/pyproject.toml b/pyproject.toml index 319e9ba..d9949b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,13 +4,13 @@ build-backend = "setuptools.build_meta" [project] name = "pulpie" -version = "0.0.1" +version = "0.0.2" description = "Fast content extraction from HTML using encoder models." readme = "README.md" license = {text = "Apache-2.0"} requires-python = ">=3.9" authors = [ - {name = "Chonkie AI", email = "team@chonkie.ai"}, + {name = "Feyn", email = "team@usefeyn.com"}, ] keywords = ["html", "content-extraction", "web", "nlp", "encoder", "transformer"] classifiers = [ @@ -46,9 +46,9 @@ dev = [ ] [project.urls] -Homepage = "https://github.com/chonkie-inc/pulpie" -Documentation = "https://github.com/chonkie-inc/pulpie" -Repository = "https://github.com/chonkie-inc/pulpie" +Homepage = "https://github.com/feyninc/pulpie" +Documentation = "https://github.com/feyninc/pulpie" +Repository = "https://github.com/feyninc/pulpie" [tool.setuptools.packages.find] where = ["src"] diff --git a/scripts/cc-filter/AGENT_PROMPT.md b/scripts/cc-filter/AGENT_PROMPT.md index 41a2cc4..f584f0a 100644 --- a/scripts/cc-filter/AGENT_PROMPT.md +++ b/scripts/cc-filter/AGENT_PROMPT.md @@ -10,7 +10,7 @@ You're setting up a CPU-only pipeline on a fresh GCP VM (target: `c2-standard-16 1. `HF_TOKEN` with write access to the `chonkie-ai` org 2. A `--start` / `--end` WARC index range to process (full range is 0–100000; ask which shard this box should own so it doesn't overlap with other boxes) -3. A way to get `stream_filter_upload.py` onto the box. It lives in this repo at `scripts/cc-filter/stream_filter_upload.py`. Easiest: `git clone https://github.com/chonkie-inc/pulpie` and copy the file out, or `gh repo clone chonkie-inc/pulpie` if `gh` is installed. +3. A way to get `stream_filter_upload.py` onto the box. It lives in this repo at `scripts/cc-filter/stream_filter_upload.py`. Easiest: `git clone https://github.com/feyninc/pulpie` and copy the file out, or `gh repo clone feyninc/pulpie` if `gh` is installed. **Authoritative setup guide:** `RECREATE_SETUP.md` in this same directory. Follow it exactly. Condensed version: diff --git a/src/pulpie/__init__.py b/src/pulpie/__init__.py index 9d9e0b7..5382fbe 100644 --- a/src/pulpie/__init__.py +++ b/src/pulpie/__init__.py @@ -4,5 +4,5 @@ from pulpie.pipeline import PageInput, PageResult, Pipeline from pulpie.simplify import simplify -__version__ = "0.0.1" +__version__ = "0.0.2" __all__ = ["ExtractionResult", "Extractor", "PageInput", "PageResult", "Pipeline", "simplify"] diff --git a/src/pulpie/model_utils.py b/src/pulpie/model_utils.py index 1d275cd..03bc874 100644 --- a/src/pulpie/model_utils.py +++ b/src/pulpie/model_utils.py @@ -10,9 +10,9 @@ from pulpie.chunker import SEP_TOKEN MODELS = { - "orange-small": "chonkie-ai/pulpie-orange-small", - "orange-base": "chonkie-ai/pulpie-orange-base", - "orange-large": "chonkie-ai/pulpie-orange-large", + "orange-small": "feyninc/pulpie-orange-small", + "orange-base": "feyninc/pulpie-orange-base", + "orange-large": "feyninc/pulpie-orange-large", } ITEM_ID_PATTERN = re.compile(r'_item_id="(\d+)"')