From 2174609a0ae9c709b4b8d5e44452a18b943af9b3 Mon Sep 17 00:00:00 2001
From: Anas Dorbani <95044293+anasdorbani@users.noreply.github.com>
Date: Tue, 10 Mar 2026 14:31:23 -0400
Subject: [PATCH 1/6] Improve v0.7.0 documentation (#249)
---
README.md | 21 +-
docs/docs/audio-support.md | 233 +++++++++++++++++++
docs/docs/developer-guide.md | 119 ++++++++++
docs/docs/faq.mdx | 4 +-
docs/docs/getting-started/getting-started.md | 8 +-
docs/docs/image-support.md | 16 +-
docs/docs/llm-metrics.md | 121 ++++++++++
docs/docs/model-parameters.md | 2 +-
docs/docs/structured-output.md | 11 +-
docs/docs/what-is-flock.md | 29 ++-
scripts/build_and_run.sh | 19 +-
11 files changed, 555 insertions(+), 28 deletions(-)
create mode 100644 docs/docs/audio-support.md
create mode 100644 docs/docs/developer-guide.md
create mode 100644 docs/docs/llm-metrics.md
diff --git a/README.md b/README.md
index 6806f1d4..a5f88134 100644
--- a/README.md
+++ b/README.md
@@ -66,9 +66,22 @@ To cite the project:
## 🔥 Features
- **Declarative SQL Interface**: Perform text generation, classification, summarization, filtering, and embedding generation using SQL queries.
-- **Multi-Provider Support**: Easily integrate with OpenAI, Azure, and Ollama for your AI needs.
+- **Multi-Provider Support**: Easily integrate with **OpenAI**, **Azure**, **Ollama**, and **Anthropic/Claude** for your AI needs.
- **End-to-End RAG Pipelines**: Enable retrieval and augmentation workflows for enhanced analytics.
- **Map and Reduce Functions**: Intuitive APIs for combining semantic tasks and data analytics directly in DuckDB.
+- **Multimodal Analytics**: First-class support for text, images, and audio (via transcription) directly in SQL.
+- **LLM Observability**: Built-in metrics tracking for tokens, latency, and call counts across Flock LLM functions.
+- **Browser & WASM Support**: Run Flock-powered DuckDB workloads in the browser via DuckDB-WASM.
+
+## ✨ Key Highlights (v0.4.0 and later)
+
+- **Anthropic/Claude Provider**: Use Claude models as a **fourth provider**, alongside OpenAI, Azure, and Ollama, with full support for structured output and image analysis.
+- **WASM Support**: Compile Flock as a DuckDB-WASM loadable extension to run in the browser, enabling client-side analytics and demos without server infrastructure.
+- **LLM Metrics Tracking**: Track token usage, API latency, and execution time through dedicated functions like `flock_get_metrics()` for better cost and performance monitoring.
+- **Audio Transcription**: Send audio inputs to OpenAI or Azure and obtain text transcripts using the same `context_columns` abstraction (with `type: 'audio'`).
+- **DuckDB v1.4.4**: Upgraded to DuckDB **1.4.4**, inheriting the latest performance and stability improvements.
+- **Architecture Improvements**: Centralized bind data and RAII-based storage guards reduce duplication and improve robustness across scalar and aggregate functions.
+- **Developer Experience**: Interactive build scripts, improved extension CI tooling, and GitHub Copilot agent instructions streamline local development and contributions.
🔝 back to top
@@ -76,11 +89,12 @@ To cite the project:
### 📝 Prerequisites
-1. **DuckDB**: Version 1.1.1 or later. Install it from the official [DuckDB installation guide](https://duckdb.org/docs/installation/).
+1. **DuckDB**: Version **1.4.4 or later**. Install it from the official [DuckDB installation guide](https://duckdb.org/docs/installation/).
2. **Supported Providers**: Ensure you have credentials or API keys for at least one of the supported providers:
- OpenAI
- Azure
- Ollama
+ - Anthropic/Claude
3. **Supported OS**:
- Linux
- macOS
@@ -160,6 +174,9 @@ SELECT llm_complete(
Explore more usage examples in the [documentation](https://dais-polymtl.github.io/flock/docs/what-is-flock).
+If you are a contributor or want to work on Flock itself, see the dedicated
+[Developer Guide](https://dais-polymtl.github.io/flock/docs/developer-guide) for build, testing, and contribution details.
+
🔝 back to top
## 🛣️ Roadmap
diff --git a/docs/docs/audio-support.md b/docs/docs/audio-support.md
new file mode 100644
index 00000000..db7bc489
--- /dev/null
+++ b/docs/docs/audio-support.md
@@ -0,0 +1,233 @@
+---
+title: Audio Transcription
+sidebar_position: 7
+---
+
+# Audio Transcription in Flock
+
+Flock supports audio transcription in SQL by sending audio inputs to compatible providers and returning text transcripts
+that you can join, filter, and analyze like any other column.
+
+import TOCInline from '@theme/TOCInline';
+
+
+
+## Overview
+
+With audio support you can:
+
+- Transcribe spoken content (meetings, calls, notes) directly in DuckDB.
+- Combine transcripts with structured data for analytics.
+- Feed transcripts into `llm_complete`, `llm_filter`, or `llm_embedding` for downstream tasks (summarization,
+ classification, similarity search, RAG, etc.).
+
+Flock uses the same `context_columns` abstraction as for images, but with `type: 'audio'` and a required
+`transcription_model`.
+
+## Supported Providers
+
+Audio transcription is supported for:
+
+- **OpenAI** – via the `audio/transcriptions` endpoint (e.g., Whisper models).
+- **Azure OpenAI** – via the Azure audio transcription endpoint.
+
+The following providers **do not** support audio transcription:
+
+- **Anthropic/Claude** – not supported; calls will raise an error.
+- **Ollama** – not supported; calls will raise an error.
+
+Refer to the provider-specific getting-started guides for API key setup:
+
+- [OpenAI](/docs/getting-started/openai)
+- [Azure](/docs/getting-started/azure)
+- [Anthropic](/docs/getting-started/anthropic) (for completions/vision only, no audio)
+
+## Using Audio in Context Columns
+
+To use audio in Flock functions, specify `type: 'audio'` and provide a `transcription_model` in the `context_columns`
+array. The audio must be accessible as a file path or URL (depending on the provider).
+
+### Context Column Structure for Audio
+
+```sql
+'context_columns': [
+ {
+ 'data': audio_path,
+ 'type': 'audio',
+ 'transcription_model': 'whisper-1'
+ }
+]
+```
+
+Each audio context column supports:
+
+- **`data`** _(required)_: SQL column containing the audio source (local file path or URL, depending on provider).
+- **`type`** _(required for audio)_: Must be set to `'audio'`.
+- **`transcription_model`** _(required when `type = 'audio'`)_: Provider-specific transcription model name.
+- **`name`** _(optional)_: Alias for referencing in prompts after transcription.
+
+### Validation Rules
+
+Flock enforces the following rules at bind time:
+
+- If `type = 'audio'`, then `transcription_model` **must** be provided, otherwise an error is raised.
+- If `transcription_model` is provided but `type` is not `'audio'`, Flock raises an error.
+
+## Basic Transcription Example
+
+The most common pattern is to transcribe audio into text, then store or further process the transcript.
+
+```sql
+-- Transcribe a list of audio files with OpenAI
+SELECT
+ audio_id,
+ file_path,
+ llm_complete(
+ {'model_name': 'gpt-4o'},
+ {
+ 'prompt': 'Transcribe the following audio file verbatim.',
+ 'context_columns': [
+ {
+ 'data': file_path,
+ 'type': 'audio',
+ 'transcription_model': 'whisper-1'
+ }
+ ]
+ }
+ ) AS transcript
+FROM VALUES
+ (1, '/data/audio/meeting_01.mp3'),
+ (2, '/data/audio/meeting_02.mp3')
+AS t(audio_id, file_path);
+```
+
+## Summarizing Transcripts
+
+After transcription, you can treat the transcript as regular text and chain additional LLM calls.
+
+```sql
+WITH raw_transcripts AS (
+ SELECT
+ audio_id,
+ llm_complete(
+ {'model_name': 'gpt-4o'},
+ {
+ 'prompt': 'Transcribe the following audio file verbatim.',
+ 'context_columns': [
+ {
+ 'data': file_path,
+ 'type': 'audio',
+ 'transcription_model': 'whisper-1'
+ }
+ ]
+ }
+ ) AS transcript
+ FROM VALUES
+ (1, '/data/audio/support_call_01.wav'),
+ (2, '/data/audio/support_call_02.wav')
+ AS t(audio_id, file_path)
+)
+SELECT
+ audio_id,
+ llm_complete(
+ {'model_name': 'gpt-4o'},
+ {
+ 'prompt': 'Summarize this call in 3 bullet points.',
+ 'context_columns': [
+ {'data': transcript, 'name': 'call'}
+ ]
+ }
+ ) AS call_summary
+FROM raw_transcripts;
+```
+
+## Filtering Based on Audio Content
+
+You can also use `llm_filter` to flag or select rows based on the audio’s content:
+
+```sql
+-- Flag calls that mention cancellations
+SELECT
+ audio_id,
+ customer_id,
+ file_path
+FROM VALUES
+ (1, 101, '/data/audio/call_01.wav'),
+ (2, 102, '/data/audio/call_02.wav'),
+ (3, 103, '/data/audio/call_03.wav')
+AS t(audio_id, customer_id, file_path)
+WHERE llm_filter(
+ {'model_name': 'gpt-4o'},
+ {
+ 'prompt': 'Does this call mention cancelling a subscription? Answer true or false.',
+ 'context_columns': [
+ {
+ 'data': file_path,
+ 'type': 'audio',
+ 'transcription_model': 'whisper-1'
+ }
+ ]
+ }
+);
+```
+
+## Embeddings from Audio (via Text)
+
+There is no direct audio embedding API in Flock. Instead, you can:
+
+1. Transcribe audio into text.
+2. Generate embeddings from the transcript using `llm_embedding`.
+
+```sql
+WITH transcripts AS (
+ SELECT
+ audio_id,
+ llm_complete(
+ {'model_name': 'gpt-4o'},
+ {
+ 'prompt': 'Transcribe the following audio file.',
+ 'context_columns': [
+ {
+ 'data': file_path,
+ 'type': 'audio',
+ 'transcription_model': 'whisper-1'
+ }
+ ]
+ }
+ ) AS transcript
+ FROM VALUES
+ (1, '/data/audio/note_01.m4a'),
+ (2, '/data/audio/note_02.m4a')
+ AS t(audio_id, file_path)
+),
+audio_embeddings AS (
+ SELECT
+ audio_id,
+ llm_embedding(
+ {'model_name': 'text-embedding-3-small'},
+ {
+ 'context_columns': [
+ {'data': transcript}
+ ]
+ }
+ ) AS embedding
+ FROM transcripts
+)
+SELECT * FROM audio_embeddings;
+```
+
+## Function Support for Audio
+
+Audio transcription is available in the following functions (via `type: 'audio'` + `transcription_model`):
+
+| Function | Audio Support | Description |
+| --------------- | ------------- | -------------------------------------------- |
+| `llm_complete`  | ✅ Full        | Transcribe and optionally transform content  |
+| `llm_filter`    | ✅ Full        | Filter rows based on audio-derived semantics |
+| `llm_reduce`    | ✅ Full        | Summarize or aggregate transcripts           |
+| `llm_rerank`    | ✅ Via text    | Rerank based on derived text features        |
+| `llm_first`     | ✅ Via text    | Pick top row based on transcript criteria    |
+| `llm_last`      | ✅ Via text    | Pick bottom row based on transcript criteria |
+| `llm_embedding` | ✅ Via text    | Embeddings over transcripts (not raw audio)  |
+
+For image-specific workflows, see the [Image Support](/docs/image-support) page.
diff --git a/docs/docs/developer-guide.md b/docs/docs/developer-guide.md
new file mode 100644
index 00000000..9e44ecb9
--- /dev/null
+++ b/docs/docs/developer-guide.md
@@ -0,0 +1,119 @@
+---
+title: Developer Guide
+sidebar_position: 11
+---
+
+# Developer Guide
+
+This guide is for developers who want to build, extend, or contribute to Flock itself.
+
+import TOCInline from '@theme/TOCInline';
+
+
+
+## Local Development Setup
+
+- **Clone the repository**:
+
+```bash
+git clone --recursive https://github.com/dais-polymtl/flock.git
+cd flock
+```
+
+- **Initialize submodules** (if you forgot `--recursive`):
+
+```bash
+git submodule update --init --recursive
+```
+
+- **Build and run via helper script**:
+
+```bash
+./scripts/build_and_run.sh
+```
+
+The interactive script will:
+
+- Check for required tools (CMake, compiler, Ninja/Make, etc.).
+- Configure dependencies via `vcpkg`.
+- Build Flock (Debug/Release).
+- Launch DuckDB with the Flock extension preloaded.
+
+See the root `README.md` for a concise overview of these steps.
+
+## Project Structure (High Level)
+
+- **DuckDB engine**: Vendored under `duckdb/`, used as the host engine.
+- **Extension sources**:
+ - `src/functions/` – scalar and aggregate LLM functions and helpers.
+ - `src/model_manager/` – model registry, providers, and adapters.
+ - `src/prompt_manager/` – prompt management and storage.
+ - `src/metrics/` – LLM metrics collection and SQL API.
+- **Docs site**: Docusaurus site under `docs/` (this documentation).
+
+## Building the Extension Manually
+
+While `./scripts/build_and_run.sh` is the recommended path, you can also build manually:
+
+```bash
+mkdir -p build
+cd build
+cmake .. -G Ninja
+ninja
+```
+
+The resulting Flock extension library can then be loaded from DuckDB using `LOAD` with the appropriate path.
+
+## Running Tests
+
+Flock comes with both **unit** and **integration** tests:
+
+- C++ unit tests live under `test/unit/`.
+- Integration tests (Python + DuckDB) live under `test/integration/`.
+
+Example pattern (from the repo root):
+
+```bash
+python -m pytest test/integration
+```
+
+Check the repository’s CI configuration for the exact commands used in automation.
+
+## Coding Conventions
+
+When contributing code:
+
+- Follow the surrounding C++ style (namespaces, includes, brace style).
+- Avoid introducing new dependencies without a clear reason.
+- Prefer small, focused pull requests with clear descriptions.
+
+If in doubt, mirror patterns used in existing functions such as `llm_complete` or the metrics manager.
+
+## Working on Providers & Models
+
+- Provider-specific adapters live under `src/model_manager/providers/adapters/`.
+- HTTP and batching logic is centralized in provider handlers under
+ `src/include/flock/model_manager/providers/handlers/`.
+- New providers should:
+ - Integrate with the existing metrics API.
+ - Respect the `context_columns` abstraction.
+ - Provide clear, actionable error messages when a feature is unsupported.
+
+For examples, see the existing OpenAI, Azure, Ollama, and Anthropic adapters.
+
+## Docs & Developer Experience
+
+The Docusaurus docs live in `docs/`. To work on them:
+
+```bash
+cd docs
+npm install
+npm start
+```
+
+This runs the docs site locally with hot reload. When adding new features to Flock, prefer updating or extending:
+
+- `what-is-flock.md` for high-level positioning.
+- The relevant function or feature page (e.g., `image-support.md`, `audio-support.md`, `llm-metrics.md`).
+- This **Developer Guide** for build, testing, or contribution-related changes.
+
diff --git a/docs/docs/faq.mdx b/docs/docs/faq.mdx
index 365cf626..4191807a 100644
--- a/docs/docs/faq.mdx
+++ b/docs/docs/faq.mdx
@@ -1,8 +1,6 @@
---
title: Frequently Asked Questions (FAQ)
-sidebar_position: 10
----title: Frequently Asked Questions (FAQ)
-sidebar_position: 8
+sidebar_position: 11
---
import Collapse from '@site/src/components/global/Collapse';
diff --git a/docs/docs/getting-started/getting-started.md b/docs/docs/getting-started/getting-started.md
index 1387e376..bc2a8531 100644
--- a/docs/docs/getting-started/getting-started.md
+++ b/docs/docs/getting-started/getting-started.md
@@ -23,7 +23,7 @@ the [DuckDB CLI Overview](https://duckdb.org/docs/stable/clients/cli/overview.ht
## Install Flock Extension
-At this stage you should have a running DuckDB instance. Flock can be installed in two ways:
+At this stage you should have a running DuckDB instance (DuckDB **v1.4.4 or later**). Flock can be installed in two ways:
### Option 1: Install from Community Extension (Recommended)
@@ -80,7 +80,7 @@ If you want to build Flock from source or contribute to the project, you can use
## Set Up API Keys for Providers
To use Flock functions, you need to set up API keys for the providers you plan to use. Flock supports multiple providers
-such as **OpenAI**, **Azure**, and **Ollama**.
+such as **OpenAI**, **Azure**, **Ollama**, and **Anthropic/Claude**.
Refer to the following sections for detailed instructions on setting up API keys for each provider.
@@ -104,3 +104,7 @@ Icon={RiOpenaiFill}
title="OpenAI"
link="/flock/docs/getting-started/openai"
/>
+
diff --git a/docs/docs/image-support.md b/docs/docs/image-support.md
index a35e2a5b..b3b332e3 100644
--- a/docs/docs/image-support.md
+++ b/docs/docs/image-support.md
@@ -19,9 +19,11 @@ Flock's image support allows you to:
- Analyze and describe image content
- Filter records based on visual criteria
-- Generate embeddings for image similarity search
+- Generate embeddings for text derived from images
- Combine image and text data in a single query
-- Process images from URLs or file paths
+- Process images from URLs, file paths, or inline (base64) data
+
+For audio-specific workflows, see the [Audio Transcription](/docs/audio-support) page.
## Supported Image Formats
@@ -44,7 +46,7 @@ Images can be provided in different formats depending on your model provider:
- **Base64 encoded strings** (for inline image data)
-### Model-Specific Examples
+### Image Model-Specific Examples
**OpenAI with URL:**
@@ -82,12 +84,12 @@ To use images in Flock functions, specify the column type as `'image'` in the `c
### Basic Structure
-Each context column can have three properties:
+Each image context column can have the following properties:
- **`data`** _(required)_: The SQL column containing image data (URL, path, or base64)
- **`name`** _(optional)_: Custom name to reference this image in your prompt
-- **`type`** _(optional)_: Set to `'image'` for image data (default is `'tabular'`)
-- **`detail`** _(optional)_: Image detail level for OpenAI models - `'low'`, `'medium'`, or `'high'` (default is
+- **`type`** _(optional)_: Set to `'image'` (default is `'tabular'` if omitted)
+- **`detail`** _(optional)_: Image detail level for OpenAI models – `'low'`, `'medium'`, or `'high'` (default is
`'low'`)
### Image Detail Parameter (OpenAI Only)
@@ -109,7 +111,7 @@ For OpenAI models, you can control the level of detail in image processing using
**Note**: The `detail` parameter only works with OpenAI vision models and is ignored by other providers.
-### Provider-Specific Examples
+### Provider-Specific Examples (Images)
**OpenAI Model with URL:**
diff --git a/docs/docs/llm-metrics.md b/docs/docs/llm-metrics.md
new file mode 100644
index 00000000..c3b9fafe
--- /dev/null
+++ b/docs/docs/llm-metrics.md
@@ -0,0 +1,121 @@
+---
+title: LLM Metrics & Observability
+sidebar_position: 8
+---
+
+# LLM Metrics & Observability
+
+Flock includes built-in observability for all LLM functions. You can inspect token usage, API latency, and execution time
+for `llm_complete`, `llm_filter`, `llm_embedding`, `llm_reduce`, `llm_rerank`, `llm_first`, and `llm_last` directly from
+SQL.
+
+import TOCInline from '@theme/TOCInline';
+
+
+
+## Overview
+
+Metrics are collected at the **database level** and aggregated across scalar and aggregate function calls. This allows
+you to answer questions like:
+
+- How many tokens did this query use?
+- Which models and providers are being called most often?
+- How much time is spent in the LLM API vs. local execution?
+
+All metrics are exposed as JSON via dedicated helper functions.
+
+## Core Functions
+
+Flock registers three scalar functions for metrics:
+
+- **`flock_get_metrics()`** – Returns a compact JSON summary of LLM usage.
+- **`flock_get_debug_metrics()`** – Returns a more verbose JSON payload, useful for debugging.
+- **`flock_reset_metrics()`** – Resets the in-memory metrics state and returns a confirmation message.
+
+### Basic Usage
+
+```sql
+-- Run some LLM queries
+SELECT llm_complete(
+    {'model_name': 'gpt-4o'},
+    {'prompt': 'Summarize this product.',
+     'context_columns': [{'data': product_name, 'name': 'product'}]}
+  )
+FROM products
+LIMIT 10;
+
+-- Inspect aggregated LLM metrics
+SELECT flock_get_metrics() AS metrics;
+```
+
+Example JSON structure (simplified):
+
+```json
+{
+ "invocations": [
+ {
+ "function": "llm_complete",
+ "model_name": "gpt-4o",
+ "provider": "openai",
+ "input_tokens": 1234,
+ "output_tokens": 456,
+ "api_calls": 10,
+ "api_duration_us": 1234567,
+ "execution_time_us": 2345678
+ }
+ ]
+}
+```
+
+### Resetting Metrics
+
+Use `flock_reset_metrics()` to clear existing metrics before a new experiment or workload:
+
+```sql
+SELECT flock_reset_metrics() AS reset_result;
+SELECT flock_get_metrics() AS metrics;
+```
+
+## Query-Level Workflows
+
+Because metrics are stored at the database level, you can combine computation and inspection in the same script:
+
+```sql
+-- 1) Clear previous metrics
+SELECT flock_reset_metrics();
+
+-- 2) Run workload
+WITH sample AS (
+ SELECT *
+ FROM (VALUES
+ (1, 'Wireless Headphones'),
+ (2, 'Gaming Laptop'),
+ (3, 'Smart Watch')
+ ) AS t(product_id, product_name)
+)
+SELECT
+ product_id,
+ llm_complete(
+ {'model_name': 'gpt-4o'},
+ {'prompt': 'Write a short marketing blurb for {name}.', 'context_columns': [{'data': product_name, 'name': 'name'}]}
+ ) AS copy
+FROM sample;
+
+-- 3) Inspect metrics
+SELECT flock_get_metrics() AS metrics;
+```
+
+You can further parse the JSON using DuckDB's `JSON` extension to build dashboards or reports.
+
+## When to Use Metrics
+
+LLM metrics are particularly useful when you:
+
+- Benchmark different providers or models.
+- Tune prompts and batch sizes for cost/performance trade-offs.
+- Monitor token usage for budgeting and quota management.
+- Diagnose slow or unexpectedly expensive queries.
+
+By combining Flock’s LLM metrics with DuckDB’s analytics capabilities, you can build fully in-database observability
+for your semantic workloads.
+
diff --git a/docs/docs/model-parameters.md b/docs/docs/model-parameters.md
index 232e8b54..dd8dab17 100644
--- a/docs/docs/model-parameters.md
+++ b/docs/docs/model-parameters.md
@@ -1,6 +1,6 @@
---
title: Model Parameters
-sidebar_position: 6
+sidebar_position: 9
---
# Model Parameters in Flock
diff --git a/docs/docs/structured-output.md b/docs/docs/structured-output.md
index 5997549d..072a7527 100644
--- a/docs/docs/structured-output.md
+++ b/docs/docs/structured-output.md
@@ -1,8 +1,6 @@
---
title: Structured Output
-sidebar_position: 8
----title: Structured Output
-sidebar_position: 6
+sidebar_position: 10
---
# Structured Output in Flock
@@ -23,6 +21,13 @@ Instead of receiving free-form text, you can specify a JSON schema that the mode
**Compatibility**: Works with all Flock LLM functions - `llm_complete`, `llm_filter`, `llm_reduce`, `llm_rerank`,
`llm_first`, `llm_last`
+**Provider Support**:
+
+- **OpenAI** – native `response_format` with JSON schemas.
+- **Ollama** – `format` field with object schemas.
+- **Anthropic/Claude** – hybrid support via `output_format` (Claude 4.x) and `tool_use` (Claude 3.x). See the
+ [Anthropic guide](/docs/getting-started/anthropic) for details.
+
:::note Prerequisites
To extract values from structured JSON responses using dot notation (e.g., `response.category`), you need to load the
JSON extension:
diff --git a/docs/docs/what-is-flock.md b/docs/docs/what-is-flock.md
index 10db3c74..8c59e6ab 100644
--- a/docs/docs/what-is-flock.md
+++ b/docs/docs/what-is-flock.md
@@ -7,12 +7,26 @@ sidebar_position: 1
## Overview
**Flock** enhances DuckDB by integrating semantic functions and robust resource management capabilities, enabling
-advanced analytics and language model operations directly within SQL queries.
+advanced analytics and language model operations directly within SQL queries. It is distributed as a DuckDB extension
+that runs on native platforms and in the browser via DuckDB-WASM.
import TOCInline from '@theme/TOCInline';
+## Key Highlights (v0.4.0 and later)
+
+- **Four LLM Providers**: OpenAI, Azure, Ollama, and Anthropic/Claude, all integrated through a unified SQL API.
+- **Multimodal Support**: Text, image, and audio inputs (via transcription) using the same `context_columns` abstraction.
+- **LLM Metrics Tracking**: Built-in functions such as `flock_get_metrics()` expose token counts, latency, and call-level
+ metrics for all Flock LLM functions.
+- **Browser & WASM Support**: Flock can be compiled as a DuckDB-WASM loadable extension to run directly in the browser.
+- **Upgraded DuckDB Engine**: Based on DuckDB v1.4.4 for improved performance and stability.
+- **Architecture Improvements**: Centralized bind data, RAII-based storage guards, and performance fixes across scalar and
+ aggregate functions.
+- **Developer Experience**: Interactive build scripts and Copilot agent instructions streamline local development and CI
+ for the extension.
+
## Semantic Functions
Flock offers a suite of semantic functions that allow users to perform various language model operations:
@@ -67,7 +81,7 @@ to produce the best-fit results, and even create end-to-end RAG pipelines.
Flock provides [**structured output**](/docs/structured-output) capabilities that allow users to obtain predictable,
schema-compliant JSON responses from Large Language Models. This feature works with all Flock LLM functions and supports
-both OpenAI and Ollama providers, ensuring consistent data formats for downstream processing.
+OpenAI, Ollama, and Anthropic/Claude providers, ensuring consistent data formats for downstream processing.
## Resource Management
@@ -81,11 +95,12 @@ Flock is supported by the different operating systems and platforms, such as:
- Linux
- macOS
- Windows
+- Modern browsers via DuckDB-WASM
And to ensure stable and reliable performance, it is important to meet only two requirements:
-- **DuckDB Setup**: Version 1.1.1 or later. Flock is compatible with the latest stable release of DuckDB, which can be
- installed from the
- official [DuckDB installation guide](https://duckdb.org/docs/installation/index?version=stable&environment=cli&platform=linux&download_method=direct&architecture=x86_64).
-- **Provider API Key**: Flock supports multiple providers such as **OpenAI**, **Azure**, and **Ollama**. Configure the
- provider of your choice to get started.
+- **DuckDB Setup**: Version **1.4.4 or later**. Flock is compatible with the latest stable release of DuckDB, which can
+ be installed from the official
+ [DuckDB installation guide](https://duckdb.org/docs/installation/index?version=stable&environment=cli&platform=linux&download_method=direct&architecture=x86_64).
+- **Provider API Key**: Flock supports multiple providers such as **OpenAI**, **Azure**, **Ollama**, and
+ **Anthropic/Claude**. Configure the provider of your choice to get started.
diff --git a/scripts/build_and_run.sh b/scripts/build_and_run.sh
index bcae602c..013cdf02 100755
--- a/scripts/build_and_run.sh
+++ b/scripts/build_and_run.sh
@@ -266,12 +266,25 @@ if [ $CONFIGURE_RESULT -ne 0 ]; then
exit 1
fi
-# Build
+# Determine parallel build jobs
+NUM_CORES=4
+if command_exists nproc; then
+ NUM_CORES=$(nproc)
+elif command_exists sysctl; then
+ CORES=$(sysctl -n hw.ncpu 2>/dev/null || echo "")
+ if [ -n "$CORES" ]; then
+ NUM_CORES="$CORES"
+ fi
+fi
+
+print_info "Using up to $NUM_CORES parallel build jobs"
+
+# Build (multi-processing enabled via -j)
if [ "$BUILD_TYPE" = "debug" ]; then
- cmake --build build/debug --config Debug
+ cmake --build build/debug --config Debug -- -j"$NUM_CORES"
BUILD_RESULT=$?
else
- cmake --build build/release --config Release
+ cmake --build build/release --config Release -- -j"$NUM_CORES"
BUILD_RESULT=$?
fi
From 2a669c945415d338c6bf7fd248d081177f3e6899 Mon Sep 17 00:00:00 2001
From: Anas Dorbani <95044293+anasdorbani@users.noreply.github.com>
Date: Tue, 10 Mar 2026 16:13:04 -0400
Subject: [PATCH 2/6] Update documentation and examples in README and code
components (#251)
---
README.md | 11 ++--
docs/src/components/why-flock/CodeBox.tsx | 53 ++++++++++++-------
docs/src/components/why-flock/TableBox.tsx | 61 ++++++++++------------
docs/src/constants/index.ts | 6 +--
docs/src/pages/index.tsx | 3 +-
5 files changed, 73 insertions(+), 61 deletions(-)
diff --git a/README.md b/README.md
index a5f88134..09563fcc 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@
Flock is an advanced **DuckDB** extension that seamlessly integrates analytics with semantic analysis through declarative SQL queries. Designed for modern data analysis needs, Flock empowers users to work with structured and unstructured data, combining OLAP workflows with the capabilities of **LLMs** (Large Language Models) and **RAG** (Retrieval-Augmented Generation) pipelines.
To cite the project:
+
```
@article{10.14778/3750601.3750685,
author = {Dorbani, Anas and Yasser, Sunny and Lin, Jimmy and Mhedhbi, Amine},
@@ -124,17 +125,20 @@ Flock is a **Community Extension** available directly from DuckDB's community ca
If you want to build Flock from source or contribute to the project, you can use our automated build script:
1. Clone the repository with submodules:
+
```bash
git clone --recursive https://github.com/dais-polymtl/flock.git
cd flock
```
-
+
Or if you've already cloned without submodules:
+
```bash
git submodule update --init --recursive
```
2. Run the build and run script:
+
```bash
./scripts/build_and_run.sh
```
@@ -150,6 +154,7 @@ If you want to build Flock from source or contribute to the project, you can use
3. The script will launch DuckDB with Flock extension ready to use. Make sure to check the [documentation](https://dais-polymtl.github.io/flock/docs/what-is-flock) for usage examples.
**Requirements for building from source:**
+
- CMake (3.5 or later)
- C++ compiler (GCC, Clang, or MSVC)
- Build system (Ninja or Make)
@@ -204,6 +209,6 @@ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file
## ✨ Team
-This project is under active development by the [**Data & AI Systems Laboratory (DAIS Lab)**](https://github.com/dais-polymtl) at [**Polytechnique Montréal**](https://www.polymtl.ca/).
+This project is under active development by the [**Data & AI Systems Laboratory (DAIS Lab)**](https://github.com/dais-polymtl) at **Polytechnique Montréal**.
-🔝 back to top
\ No newline at end of file
+🔝 back to top
diff --git a/docs/src/components/why-flock/CodeBox.tsx b/docs/src/components/why-flock/CodeBox.tsx
index 93b67579..a5899b31 100644
--- a/docs/src/components/why-flock/CodeBox.tsx
+++ b/docs/src/components/why-flock/CodeBox.tsx
@@ -22,29 +22,42 @@ const CodeBox = () => {
9
10
11
+ 12
+ 13
+ 14
+ 15
+ 16
+ 17
-
-
- >{" "}
- SELECT
- {" "}paper_id,
- {" "}paper_title,
- {" "}llm_complete(
- {" "}{"{"}'model_name': 'gpt-4o'{"}"},
- {" "}{"{"}'prompt': 'Extract the top 3
keywords from the abstract.'{"}"},
- {" "}{"{"}'abstract': abstract{"}"}
- {" "}) AS extracted_keywords
- {" "}FROM research_papers
- {" "}WHERE publication_year > 2018
- {" "}LIMIT 5
- {" "};
-
-
+
+
+ -- 1) Create a model
+ CREATE MODEL(
+ {" "}'product_summarizer',
+ {" "}'gpt-4o',
+ {" "}'openai'
+ );
+
+ -- 2) Call it from SQL
+ SELECT
+ {" "}product_id,
+ {" "}name,
+ {" "}llm_complete(
+ {" "}{"{"}'model_name': 'product_summarizer'{"}"},
+ {" "}{"{"}
+ 'prompt': 'Summarize this product in one sentence.',
+ {" "}'context_columns': [{"{"}'data': name{"}"}]
+ {"}"}
+ {" "}) AS short_description
+ FROM products
+ LIMIT 3;
+
+
- )
-}
+ );
+};
-export default CodeBox
\ No newline at end of file
+export default CodeBox;
\ No newline at end of file
diff --git a/docs/src/components/why-flock/TableBox.tsx b/docs/src/components/why-flock/TableBox.tsx
index 92f0ca55..0a9aeded 100644
--- a/docs/src/components/why-flock/TableBox.tsx
+++ b/docs/src/components/why-flock/TableBox.tsx
@@ -1,37 +1,32 @@
const TableBox = () => {
return (
-
-
- | ID |
- Paper Title |
- Extracted Keywords |
-
-
-
-
- | 1 |
- Innovations in Biotechnology |
- Biotechnology, Genetic Engineering, CRISPR |
-
-
- | 2 |
- The Rise of Autonomous Vehicles |
- Autonomous Vehicles, AI, Self-driving Cars |
-
-
- | 3 |
- Exploring Renewable Energy Solutions |
- Renewable Energy, Solar Power, Wind Turbines |
-
-
- | 4 |
- Understanding Blockchain Technology |
- Blockchain, Cryptocurrencies, Decentralization |
-
-
-
- )
-}
+
+
+ | ID |
+ Product |
+ Short Description (from LLM) |
+
+
+
+
+ | 1 |
+ Wireless Headphones |
+ Comfortable Bluetooth headphones with clear sound and long battery life. |
+
+
+ | 2 |
+ Gaming Laptop |
+ High‑performance laptop designed for modern games and creative workloads. |
+
+
+ | 3 |
+ Smart Watch |
+ Everyday smartwatch that tracks activity, notifications, and heart rate. |
+
+
+
+ );
+};
-export default TableBox
\ No newline at end of file
+export default TableBox;
\ No newline at end of file
diff --git a/docs/src/constants/index.ts b/docs/src/constants/index.ts
index ed90e351..b50cdbfb 100644
--- a/docs/src/constants/index.ts
+++ b/docs/src/constants/index.ts
@@ -49,15 +49,15 @@ export const features = [
export const whyFlock = [
{
id: "why-flock-1",
- content: "Declarative LLM integration within SQL",
+ content: "Create reusable models and prompts once, then call them from simple SQL.",
},
{
id: "why-flock-2",
- content: "Transparently adds lower-level optimizations, e.g., batching and caching",
+ content: "Run LLM workloads next to your data without custom services or glue code.",
},
{
id: "why-flock-3",
- content: "Add tabular understanding capabilities to your relational databases",
+ content: "Keep results in tables you can join, filter, and visualize with regular DuckDB queries.",
}
];
diff --git a/docs/src/pages/index.tsx b/docs/src/pages/index.tsx
index b1625ecb..17fac8f3 100644
--- a/docs/src/pages/index.tsx
+++ b/docs/src/pages/index.tsx
@@ -1,6 +1,6 @@
import BrowserOnly from '@docusaurus/BrowserOnly';
import styles from "@site/src/css/style";
-import {Navbar, Hero, Features, WhyFlock, GettingStarted, CTA, Footer, Team} from "@site/src/components";
+import {Navbar, Hero, Features, WhyFlock, GettingStarted, CTA, Footer} from "@site/src/components";
const Home: React.FC = () => {
return (
@@ -23,7 +23,6 @@ const Home: React.FC = () => {
-
From adbf0d4dcad4aa7579aa91ca0f42bcee24900ad3 Mon Sep 17 00:00:00 2001
From: Anas Dorbani <95044293+anasdorbani@users.noreply.github.com>
Date: Tue, 10 Mar 2026 16:37:25 -0400
Subject: [PATCH 3/6] Enhance DocCard component to support optional icons and
add Anthropic icon to documentation. (#253)
---
docs/docs/getting-started/getting-started.md | 3 ++-
docs/src/components/global/DocCard.tsx | 18 ++++++++++++++----
2 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/docs/docs/getting-started/getting-started.md b/docs/docs/getting-started/getting-started.md
index bc2a8531..ebe2b657 100644
--- a/docs/docs/getting-started/getting-started.md
+++ b/docs/docs/getting-started/getting-started.md
@@ -87,7 +87,7 @@ Refer to the following sections for detailed instructions on setting up API keys
import DocCard from '@site/src/components/global/DocCard';
import { RiOpenaiFill } from "react-icons/ri";
import { VscAzure } from "react-icons/vsc";
-import { SiOllama } from "react-icons/si";
+import { SiOllama, SiAnthropic } from "react-icons/si";
diff --git a/docs/src/components/global/DocCard.tsx b/docs/src/components/global/DocCard.tsx
index 02570da6..f55c2499 100644
--- a/docs/src/components/global/DocCard.tsx
+++ b/docs/src/components/global/DocCard.tsx
@@ -1,14 +1,24 @@
import React from "react";
import type { IconType } from "react-icons/lib";
-export default function DocCard(props: { Icon: IconType; title: string; link: string }) {
+type DocCardProps = {
+ Icon?: IconType;
+ title: string;
+ link: string;
+};
+
+export default function DocCard(props: DocCardProps) {
const { Icon, title, link } = props;
return (
<>
- window.open(link, "_self")}
- className="my-2 flex gap-2 cursor-pointer rounded-2xl items-center text-lg font-bold border-solid border-[1px] border-[#FF9128] p-4 hover:shadow-[0_0_10px_#FF9128] hover:shadow-orange-500/50 transition-all duration-300 ease-in-out">
-
+
window.open(link, "_self")}
+ className="my-2 flex gap-2 cursor-pointer rounded-2xl items-center text-lg font-bold border-solid border-[1px] border-[#FF9128] p-4 hover:shadow-[0_0_10px_#FF9128] hover:shadow-orange-500/50 transition-all duration-300 ease-in-out"
+ >
+ {Icon && (
+
+ )}
{title}
>
From 7cfcb4eabe4ac9756cc348848a5e5a8ba8ca5784 Mon Sep 17 00:00:00 2001
From: Anas Dorbani
Date: Tue, 7 Apr 2026 08:58:24 -0400
Subject: [PATCH 4/6] Upgrade DuckDB and extension CI tools to version 1.5.0
---
.github/copilot-instructions.md | 4 ++--
.github/workflows/MainDistributionPipeline.yml | 14 +++++++-------
README.md | 4 ++--
duckdb | 2 +-
extension-ci-tools | 2 +-
5 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 81bdfee9..9acbbbd6 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -8,7 +8,7 @@
- **Build system**: CMake (3.5+) with DuckDB's extension CI tools (`extension-ci-tools/`)
- **Dependency manager**: vcpkg (managed via `vcpkg.json`)
- **Key dependencies**: `nlohmann-json`, `curl`, `gtest` (see `vcpkg.json`)
-- **DuckDB version targeted**: v1.4.4 (see `MainDistributionPipeline.yml`)
+- **DuckDB version targeted**: v1.5.0 (see `MainDistributionPipeline.yml`)
## Repository Layout
@@ -112,7 +112,7 @@ Always run `clang-format` on modified C++ files before committing. The CI pipeli
Defined in `.github/workflows/MainDistributionPipeline.yml`:
-- **duckdb-stable-build**: Builds extension binaries for all platforms using DuckDB v1.4.4 CI tools.
+- **duckdb-stable-build**: Builds extension binaries for all platforms using DuckDB v1.5.0 CI tools.
- **code-quality-check**: Runs `clang-format` and `clang-tidy` checks.
Triggered on push to `main`/`dev` when `src/`, `test/`, `CMakeLists.txt`, or workflow files change, and on `workflow_dispatch`.
diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml
index 9f5749d6..831abaa3 100644
--- a/.github/workflows/MainDistributionPipeline.yml
+++ b/.github/workflows/MainDistributionPipeline.yml
@@ -24,17 +24,17 @@ concurrency:
jobs:
duckdb-stable-build:
name: Build extension binaries
- uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.4.4
+ uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.5.0
with:
- duckdb_version: v1.4.4
- ci_tools_version: v1.4.4
+ duckdb_version: v1.5.0
+ ci_tools_version: v1.5.0
extension_name: flock
code-quality-check:
name: Code Quality Check
- uses: duckdb/extension-ci-tools/.github/workflows/_extension_code_quality.yml@v1.4.4
+ uses: duckdb/extension-ci-tools/.github/workflows/_extension_code_quality.yml@v1.5.0
with:
- duckdb_version: v1.4.4
- ci_tools_version: v1.4.4
+ duckdb_version: v1.5.0
+ ci_tools_version: v1.5.0
extension_name: flock
- format_checks: 'format;tidy'
+ format_checks: "format;tidy"
diff --git a/README.md b/README.md
index 09563fcc..e0147437 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ To cite the project:
- **WASM Support**: Compile Flock as a DuckDB-WASM loadable extension to run in the browser, enabling client-side analytics and demos without server infrastructure.
- **LLM Metrics Tracking**: Track token usage, API latency, and execution time through dedicated functions like `flock_get_metrics()` for better cost and performance monitoring.
- **Audio Transcription**: Send audio inputs to OpenAI or Azure and obtain text transcripts using the same `context_columns` abstraction (with `type: 'audio'`).
-- **DuckDB v1.4.4**: Upgraded to DuckDB **1.4.4**, inheriting the latest performance and stability improvements.
+- **DuckDB v1.5.0**: Upgraded to DuckDB **1.5.0**, inheriting the latest performance and stability improvements.
- **Architecture Improvements**: Centralized bind data and RAII-based storage guards reduce duplication and improve robustness across scalar and aggregate functions.
- **Developer Experience**: Interactive build scripts, improved extension CI tooling, and GitHub Copilot agent instructions streamline local development and contributions.
@@ -90,7 +90,7 @@ To cite the project:
### 📝 Prerequisites
-1. **DuckDB**: Version **1.4.4 or later**. Install it from the official [DuckDB installation guide](https://duckdb.org/docs/installation/).
+1. **DuckDB**: Version **1.5.0 or later**. Install it from the official [DuckDB installation guide](https://duckdb.org/docs/installation/).
2. **Supported Providers**: Ensure you have credentials or API keys for at least one of the supported providers:
- OpenAI
- Azure
diff --git a/duckdb b/duckdb
index 6ddac802..3a3967aa 160000
--- a/duckdb
+++ b/duckdb
@@ -1 +1 @@
-Subproject commit 6ddac802ffa9bcfbcc3f5f0d71de5dff9b0bc250
+Subproject commit 3a3967aa8190d0a2d1931d4ca4f5d920760030b4
diff --git a/extension-ci-tools b/extension-ci-tools
index 86fa59ca..02fb3fd3 160000
--- a/extension-ci-tools
+++ b/extension-ci-tools
@@ -1 +1 @@
-Subproject commit 86fa59ca22c3f5dcbe7e1d17aea6b79c97cb3616
+Subproject commit 02fb3fd377ba6c46d61b1163413961558cecf5a3
From 05f4f4edbecd3249d294ce2829bdb49d560a54d9 Mon Sep 17 00:00:00 2001
From: Anas Dorbani <95044293+anasdorbani@users.noreply.github.com>
Date: Wed, 8 Apr 2026 15:19:07 -0400
Subject: [PATCH 5/6] Refactor flock extension to use extension registration
APIs (#259)
---
CMakeLists.txt | 13 -------------
extension_config.cmake | 6 ++++--
src/core/config/model.cpp | 2 --
src/flock_extension.cpp | 6 +++---
.../scalar/llm_filter/implementation.cpp | 15 ++++++++++-----
src/include/flock_extension.hpp | 2 ++
test/unit/CMakeLists.txt | 4 ++++
7 files changed, 23 insertions(+), 25 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 383772e3..83efadaa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,19 +31,6 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION
target_link_libraries(${EXTENSION_NAME} -lstdc++fs)
endif()
-# Check if we're in debug mode and enable AddressSanitizer
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
- message(STATUS "Enabling AddressSanitizer for Debug build")
- # Enable AddressSanitizer
- target_compile_options(${EXTENSION_NAME} PRIVATE -fsanitize=address
- -fno-omit-frame-pointer)
- target_link_options(${EXTENSION_NAME} PRIVATE -fsanitize=address)
-
- target_compile_options(${LOADABLE_EXTENSION_NAME}
- PRIVATE -fsanitize=address -fno-omit-frame-pointer)
- target_link_options(${LOADABLE_EXTENSION_NAME} PRIVATE -fsanitize=address)
-endif()
-
# Link libraries for the static extension
if(NOT EMSCRIPTEN)
target_link_libraries(${EXTENSION_NAME} CURL::libcurl)
diff --git a/extension_config.cmake b/extension_config.cmake
index 3205e38b..46dcd65e 100644
--- a/extension_config.cmake
+++ b/extension_config.cmake
@@ -1,7 +1,9 @@
# This file is included by DuckDB's build system. It specifies which extension
# to load
+# Ensure dependencies are loaded before flock bootstraps config
+duckdb_extension_load(core_functions)
+duckdb_extension_load(json)
+
# Extension from this repo
duckdb_extension_load(flock SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} LOAD_TESTS)
-
-# Any extra extensions that should be built e.g.: duckdb_extension_load(json)
diff --git a/src/core/config/model.cpp b/src/core/config/model.cpp
index b85d558e..eda6e989 100644
--- a/src/core/config/model.cpp
+++ b/src/core/config/model.cpp
@@ -9,7 +9,6 @@ std::string Config::get_user_defined_models_table_name() { return "FLOCKMTL_MODE
void Config::SetupDefaultModelsConfig(duckdb::Connection& con, std::string& schema_name) {
const std::string table_name = Config::get_default_models_table_name();
- con.Query("INSTALL JSON; LOAD JSON;");
con.Query(duckdb_fmt::format(" CREATE TABLE IF NOT EXISTS {}.{} ( "
" model_name VARCHAR NOT NULL PRIMARY KEY, "
" model VARCHAR NOT NULL, "
@@ -33,7 +32,6 @@ void Config::SetupDefaultModelsConfig(duckdb::Connection& con, std::string& sche
void Config::SetupUserDefinedModelsConfig(duckdb::Connection& con, std::string& schema_name) {
const std::string table_name = Config::get_user_defined_models_table_name();
- con.Query("INSTALL JSON; LOAD JSON;");
con.Query(duckdb_fmt::format(" CREATE TABLE IF NOT EXISTS {}.{} ( "
" model_name VARCHAR NOT NULL PRIMARY KEY, "
" model VARCHAR NOT NULL, "
diff --git a/src/flock_extension.cpp b/src/flock_extension.cpp
index faef6dc4..f2f21d0e 100644
--- a/src/flock_extension.cpp
+++ b/src/flock_extension.cpp
@@ -15,11 +15,11 @@ namespace duckdb {
static void LoadInternal(ExtensionLoader& loader) {
flock::Config::Configure(loader);
- // Register the custom parser
+ // Register parser and binder hooks using extension registration APIs.
auto& config = DBConfig::GetConfig(loader.GetDatabaseInstance());
DuckParserExtension duck_parser;
- config.parser_extensions.push_back(duck_parser);
- config.operator_extensions.push_back(make_uniq());
+ ParserExtension::Register(config, duck_parser);
+ OperatorExtension::Register(config, make_shared_ptr());
}
ParserExtensionParseResult duck_parse(ParserExtensionInfo*, const std::string& query) {
diff --git a/src/functions/scalar/llm_filter/implementation.cpp b/src/functions/scalar/llm_filter/implementation.cpp
index e0a4419f..363e7829 100644
--- a/src/functions/scalar/llm_filter/implementation.cpp
+++ b/src/functions/scalar/llm_filter/implementation.cpp
@@ -84,11 +84,16 @@ void LlmFilter::Execute(duckdb::DataChunk& args, duckdb::ExpressionState& state,
auto& func_expr = state.expr.Cast();
auto* bind_data = &func_expr.bind_info->Cast();
- const auto results = LlmFilter::Operation(args, bind_data);
-
- auto index = 0;
- for (const auto& res: results) {
- result.SetValue(index++, duckdb::Value(res));
+ if (const auto results = LlmFilter::Operation(args, bind_data); static_cast(results.size()) == 1) {
+ auto empty_vec = duckdb::Vector(std::string());
+ duckdb::UnaryExecutor::Execute(
+ empty_vec, result, args.size(),
+ [&](duckdb::string_t name) { return duckdb::StringVector::AddString(result, results[0]); });
+ } else {
+ auto index = 0;
+ for (const auto& res: results) {
+ result.SetValue(index++, duckdb::Value(res));
+ }
}
auto exec_end = std::chrono::high_resolution_clock::now();
diff --git a/src/include/flock_extension.hpp b/src/include/flock_extension.hpp
index 4c098257..f8ce44d1 100644
--- a/src/include/flock_extension.hpp
+++ b/src/include/flock_extension.hpp
@@ -1,6 +1,8 @@
#pragma once
#include "flock/core/common.hpp"
+#include "duckdb/parser/parser_extension.hpp"
+#include "duckdb/planner/operator_extension.hpp"
namespace duckdb {
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 0bfbc02f..0c9e5586 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -8,6 +8,10 @@ file(COPY unit_test.db DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
add_executable(${PROJECT_NAME}_tests test_main.cpp ${TEST_SOURCES})
target_link_libraries(${PROJECT_NAME}_tests PRIVATE ${PROJECT_NAME}_extension
+ duckdb_generated_extension_loader
+ core_functions_extension
+ parquet_extension
+ json_extension
GTest::gtest GTest::gmock)
add_test(AllTestsInMain ${PROJECT_NAME}_tests)
From dd038e131ac6818f10a93e3e2941ff3600d2a243 Mon Sep 17 00:00:00 2001
From: Anas Dorbani <95044293+anasdorbani@users.noreply.github.com>
Date: Thu, 9 Apr 2026 16:42:00 -0400
Subject: [PATCH 6/6] Fixed the test build on linux (#260)
---
.../workflows/MainDistributionPipeline.yml | 2 +-
CMakeLists.txt | 15 ++++++------
test/unit/CMakeLists.txt | 24 ++++++++++++-------
3 files changed, 24 insertions(+), 17 deletions(-)
diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml
index 831abaa3..f7d47247 100644
--- a/.github/workflows/MainDistributionPipeline.yml
+++ b/.github/workflows/MainDistributionPipeline.yml
@@ -37,4 +37,4 @@ jobs:
duckdb_version: v1.5.0
ci_tools_version: v1.5.0
extension_name: flock
- format_checks: "format;tidy"
+ format_checks: "format;tidy"
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83efadaa..633a0583 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,6 +21,13 @@ if(NOT EMSCRIPTEN)
endif()
find_package(nlohmann_json CONFIG REQUIRED)
+# Coverage instrumentation (must be before targets are built)
+if(CMAKE_BUILD_TYPE STREQUAL "Coverage")
+ message(STATUS "Enabling code coverage")
+ add_compile_options(-fprofile-instr-generate -fcoverage-mapping)
+ add_link_options(-fprofile-instr-generate -fcoverage-mapping)
+endif()
+
# Build the DuckDB static and loadable extensions
build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})
build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES})
@@ -43,8 +50,6 @@ if(NOT EMSCRIPTEN)
endif()
target_link_libraries(${LOADABLE_EXTENSION_NAME} nlohmann_json::nlohmann_json)
-# WASM builds use EM_JS with synchronous XMLHttpRequest for HTTP
-
# Install the extension
install(
TARGETS ${EXTENSION_NAME}
@@ -52,12 +57,6 @@ install(
LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
ARCHIVE DESTINATION "${INSTALL_LIB_DIR}")
-if(CMAKE_BUILD_TYPE STREQUAL "Coverage")
- message(STATUS "Enabling code coverage for Debug build")
- add_compile_options(-fprofile-instr-generate -fcoverage-mapping)
- add_link_options(-fprofile-instr-generate -fcoverage-mapping)
-endif()
-
if(NOT EMSCRIPTEN)
# Add the test directory if not on WASM
enable_testing()
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 0c9e5586..53b53570 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -1,17 +1,25 @@
find_package(GTest CONFIG REQUIRED)
-file(GLOB_RECURSE TEST_SOURCES *.cpp)
-list(REMOVE_ITEM TEST_SOURCES "test_main.cpp")
+file(GLOB_RECURSE TEST_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+list(FILTER TEST_SOURCES EXCLUDE REGEX ".*test_main\\.cpp$")
file(COPY unit_test.db DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
add_executable(${PROJECT_NAME}_tests test_main.cpp ${TEST_SOURCES})
-target_link_libraries(${PROJECT_NAME}_tests PRIVATE ${PROJECT_NAME}_extension
- duckdb_generated_extension_loader
- core_functions_extension
- parquet_extension
- json_extension
- GTest::gtest GTest::gmock)
+# GNU ld (Linux + MinGW) needs --start-group/--end-group for circular deps
+# jemalloc is only built on Linux
+target_link_libraries(${PROJECT_NAME}_tests PRIVATE
+ GTest::gtest
+ GTest::gmock
+ $<$:-Wl,--start-group>
+ ${PROJECT_NAME}_extension
+ duckdb_generated_extension_loader
+ core_functions_extension
+ json_extension
+ parquet_extension
+ $<$:jemalloc_extension>
+ duckdb_static
+ $<$:-Wl,--end-group>)
add_test(AllTestsInMain ${PROJECT_NAME}_tests)