Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
295bf64
llama integration with exception safety
reeshabh90 Feb 18, 2026
430c854
PR review suggested changes
reeshabh90 Feb 20, 2026
63546cc
renaming model-runner to c2t_runner
reeshabh90 Feb 24, 2026
04443d2
- llama-embedding integration with local-ai::embed.
reeshabh90 Feb 24, 2026
ce6c8ba
PR Suggested changes <minor fixes>
reeshabh90 Feb 27, 2026
4048bb7
This commit includes following major changes:
reeshabh90 Mar 8, 2026
f591e3f
minor - added documentation
reeshabh90 Mar 8, 2026
8d75dd4
PR review suggested changes and necessary documentations.
reeshabh90 Mar 12, 2026
f1269f9
Removing destructor as it is not doing anything user provided
reeshabh90 Mar 13, 2026
12c195a
Noticed one bug, hence fixed. Variable was getting shadowed.
reeshabh90 Mar 13, 2026
b9e36fe
Renaming c2t_runner to ct2_runner
reeshabh90 Mar 16, 2026
fd4e8a7
VCPKG Features introduction for custom installation of AI specific
reeshabh90 Mar 17, 2026
51412ae
Feature based AI capability installation via VCPKG.
reeshabh90 Mar 21, 2026
535b884
Merge tag '2026.03.26' into llama-integration
reeshabh90 Mar 30, 2026
5e7068f
Qwen port package renaming
reeshabh90 Mar 30, 2026
6bcba54
changes relate to test execution for build
reeshabh90 Mar 31, 2026
937b6d3
Docwire CLI feature customization code for correct build process
reeshabh90 Apr 1, 2026
61b6b0f
Attempt to resolve tokenizer usage based on Docwire Local CT2 flag in
reeshabh90 Apr 1, 2026
f55e5ab
Inclusion of headers in docwire.h based on Local CT2 and LLama flags
reeshabh90 Apr 1, 2026
eed0fda
re-design of build architecture for local ai usage, and for efficient
reeshabh90 Apr 2, 2026
ea10198
making API changes in integration test
reeshabh90 Apr 2, 2026
19b79a6
a small fix for installation of docwire_local_ai INTERFACE
reeshabh90 Apr 3, 2026
2c42539
llama.cpp integration of chat template for inference
reeshabh90 Apr 16, 2026
2e07529
Adding/restoring integration example for ct2
reeshabh90 Apr 18, 2026
17f5457
Build Architecture revamp.
reeshabh90 Apr 21, 2026
53760c7
newly added files
reeshabh90 Apr 21, 2026
b38aa2d
Updated code changes based on feedback
reeshabh90 May 9, 2026
183b577
Changes related to embed implementation with default prefixes
reeshabh90 May 21, 2026
89da06d
minor update related to removing e5 specific default.
reeshabh90 May 21, 2026
b80e807
Minor code changes based on review suggestions
reeshabh90 May 26, 2026
e8a0078
embed namespace naming convention changes
reeshabh90 Jun 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ version.h
ports/docwire/.disable_binary_cache
.cache
compile_commands.json
.zed
.clang-format
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.17)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # for VSCode CLangd extension

option(DOCWIRE_LOCAL_CT2 "Enable local AI (Translation / Text Generation / Embedding models)" OFF)
Comment thread
reeshabh90 marked this conversation as resolved.
option(DOCWIRE_LLAMA "Enable llama.cpp engine" OFF)

# Get version from ChangeLog.md and store it in DOCWIRE_VERSION and SIMPLE_DOCWIRE_VERSION
function(extract_version)
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/doc/ChangeLog.md" changelog_text)
Expand Down
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ std::filesystem::path("test.zip") | content_type::detector{} | archives_parser{}
Classify a file in any format (Office, PDF, mail, etc.) into any categories using the built-in local AI model:

```cpp
std::filesystem::path("...") | ... | local_ai::model_chain_element("Classify to...: agreement, invoice, report...") | out_stream;
std::filesystem::path("...") | ... | ai::local::task("Classify to...: agreement, invoice, report...") | out_stream;
ensure(out_stream.str()) == "report";
```
[Full example](https://docwire.readthedocs.io/en/latest/local_ai_classify_8cpp-example.html)
Expand All @@ -381,7 +381,7 @@ ensure(out_stream.str()) == "report\n";
Translate a document in any format (Office, PDF, mail, etc.) into another language using the built-in local AI model:

```cpp
std::filesystem::path("...") | ... | local_ai::model_chain_element("Translate to spanish:\n\n") | out_stream;
std::filesystem::path("...") | ... | ai::local::translate("spanish") | out_stream;
ensure(fuzzy_match::ratio(out_stream.str(), "La procesación de datos se refiere a las actividades...")) > 80;
```
[Full example](https://docwire.readthedocs.io/en/latest/local_ai_translate_8cpp-example.html)
Expand All @@ -397,7 +397,7 @@ ensure(fuzzy_match::ratio(out_stream.str(), "El procesamiento de datos se refier
Detect the sentiment of a document in any format (Office, PDF, mail, etc.) using the built-in local AI model:

```cpp
std::filesystem::path("...") | ... | local_ai::model_chain_element("Detect sentiment:\n\n") | out_stream;
std::filesystem::path("...") | ... | ai::local::task("Detect sentiment:\n\n") | out_stream;
ensure(out_stream.str()) == "positive";
```
[Full example](https://docwire.readthedocs.io/en/latest/local_ai_sentiment_8cpp-example.html)
Expand All @@ -412,7 +412,7 @@ std::filesystem::path("1.doc") | ... | openai::DetectSentiment(...) | std::cout;
Make a summary of a document in any format (Office, PDF, mail, etc.) using the built-in local AI model:

```cpp
std::filesystem::path("...") | ... | local_ai::model_chain_element("Write a short summary...") | out_stream;
std::filesystem::path("...") | ... | ai::local::summarize() | out_stream;
ensure(out_stream.str()).is_one_of({ "Data processing is the collection, organization, analysis, and interpretation of data to extract useful insights and support decision-making."...
```
[Full example](https://docwire.readthedocs.io/en/latest/local_ai_summary_8cpp-example.html)
Expand All @@ -435,7 +435,7 @@ ensure(fuzzy_match::ratio(out_stream.str(), "Data processing involves converting
Find phrases, objects and events with smart matching in documents in any format (Office, PDF, mail, etc.) using the built-in local AI model:

```cpp
std::filesystem::path("...") | ... | local_ai::model_chain_element("Find sentence about \"data conversion\"...") | out_stream;
std::filesystem::path("...") | ... | ai::local::task("Find sentence about \"data conversion\"...") | out_stream;
ensure(out_stream.str()).is_one_of({ "Data processing refers to the activities performed on raw data to convert it into meaningful information."...
```
[Full example](https://docwire.readthedocs.io/en/latest/local_ai_find_8cpp-example.html)
Expand All @@ -461,9 +461,9 @@ ensure(out_msgs[0]->get<ai::embedding>().values.size()) == 1536;
Create an embedding for a document in any format (Office, PDF, mail, etc.) using the built-in local AI model, create embeddings for two queries, and calculate their similarity:

```cpp
std::filesystem::path("data_processing_definition.doc") | ... | local_ai::embed(local_ai::embed::e5_passage_prefix) | passage_msgs;
std::filesystem::path("data_processing_definition.doc") | ... | ai::local::passage::embedder{} | passage_msgs;
...
docwire::data_source{std::string{"What is data processing?"}, ...} | local_ai::embed(local_ai::embed::e5_query_prefix) | similar_query_msgs;
docwire::data_source{std::string{"What is data processing?"}, ...} | ai::local::query::embedder{} | similar_query_msgs;
...
double sim = cosine_similarity(passage_embedding.values, similar_query_embedding.values);
...
Expand Down
47 changes: 47 additions & 0 deletions build_demo.sh
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be better to modify/parametrize build.sh/build.ps1 instead of creating another script. If this is only for you, then it should not be added to the repository.

Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash
# Configure and build the demo project from a clean build directory against a
# vcpkg-installed DocWire.
#
# Every setting below can be overridden from the environment, for example:
#   VCPKG_ROOT=/opt/vcpkg BUILD_DIR=/tmp/demo-build ./build_demo.sh
# The defaults reproduce the paths this script originally hard-coded.
set -e # stop on first error

# ==========================
# CONFIGURATION — override via environment
# ==========================
VCPKG_ROOT="${VCPKG_ROOT:-/home/reeshabh/coderepos/docwire/vcpkg}"
VCPKG_TRIPLET="${VCPKG_TRIPLET:-x64-linux-dynamic}"
# Installation prefix of the selected triplet inside the vcpkg tree.
VCPKG_PREFIX="${VCPKG_ROOT}/installed/${VCPKG_TRIPLET}"
VCPKG_TOOLCHAIN="${VCPKG_TOOLCHAIN:-${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake}"
DOCWIRE_DIR="${DOCWIRE_DIR:-${VCPKG_PREFIX}/share/docwire}"
BUILD_DIR="${BUILD_DIR:-./build}"
DEMO_EXEC="${DEMO_EXEC:-demo}"

# The project sources are expected in the directory the script is started from.
# Remember it before changing into the build directory, so that BUILD_DIR does
# not have to be a direct child of the source directory.
SOURCE_DIR="$PWD"

# ==========================
# CLEAN BUILD
# ==========================
echo "Cleaning old build folder..."
rm -rf -- "$BUILD_DIR"
mkdir -p -- "$BUILD_DIR"
cd "$BUILD_DIR"

# ==========================
# CONFIGURE CMAKE
# ==========================
echo "Configuring CMake..."
cmake -S "$SOURCE_DIR" -B . \
    -DCMAKE_TOOLCHAIN_FILE="$VCPKG_TOOLCHAIN" \
    -DVCPKG_TARGET_TRIPLET="$VCPKG_TRIPLET" \
    -Ddocwire_DIR="$DOCWIRE_DIR" \
    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON

# ==========================
# BUILD
# ==========================
echo "Building project..."
cmake --build .

# ==========================
# SET LIBRARY PATH
# ==========================
# Append the previous value only when it is non-empty: a trailing ':' would add
# an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="${VCPKG_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RPATH is better on Linux/MacOS than LD_LIBRARY_PATH
https://en.wikipedia.org/wiki/Rpath


# ==========================
# RUN DEMO
# ==========================
# echo "Running demo..."
# ./$DEMO_EXEC
45 changes: 45 additions & 0 deletions download_llama_model.sh
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shell script is not good because of Windows. You would have to have two scripts: sh and ps1. What should be considered is downloading in CMake script (FetchContent etc) or using vcpkg ports. Currently we use vcpkg but this is open for discussion (can change in the future).

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, we will discuss, whether we auto download via vcpkg or let the user manage the model themselves.

Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
# Download a GGUF model file from Hugging Face into OUTPUT_DIR.
#
# Environment overrides (defaults in parentheses):
#   MODEL_NAME   (qwen2-7b-instruct)
#   MODEL_QUANT  (q4_k_m)
#   MODEL_REPO   (Qwen/Qwen2-7B-Instruct-GGUF)
#   MODEL_FILE   (<MODEL_NAME>-<MODEL_QUANT>.gguf)
#   OUTPUT_DIR   (models)
#
# Exits 0 without downloading when the model file is already present.
set -e

# Configurable defaults

MODEL_NAME="${MODEL_NAME:-qwen2-7b-instruct}"
MODEL_QUANT="${MODEL_QUANT:-q4_k_m}"
MODEL_REPO="${MODEL_REPO:-Qwen/Qwen2-7B-Instruct-GGUF}"

# Derived values
# Name and quantization are joined with '-', the same file name that
# ports/qwen2-7b-instruct-q4-k-m/portfile.cmake downloads from this repository.
MODEL_FILE="${MODEL_FILE:-${MODEL_NAME}-${MODEL_QUANT}.gguf}"
OUTPUT_DIR="${OUTPUT_DIR:-models}"
HF_URL="https://huggingface.co/${MODEL_REPO}/resolve/main/${MODEL_FILE}"
# Data is written here first and renamed only after a successful transfer, so an
# interrupted download is never mistaken for a complete model on the next run.
PARTIAL_FILE="${MODEL_FILE}.part"

# Checks
if ! command -v wget &> /dev/null && ! command -v curl &> /dev/null; then
    echo "Error: Neither wget nor curl is installed." >&2
    exit 1
fi

mkdir -p "${OUTPUT_DIR}"
cd "${OUTPUT_DIR}"

if [ -f "${MODEL_FILE}" ]; then
    echo "Model already exists: ${OUTPUT_DIR}/${MODEL_FILE}"
    exit 0
fi

echo "Downloading model:"
echo " Repository : ${MODEL_REPO}"
echo " File : ${MODEL_FILE}"
echo " Destination: ${OUTPUT_DIR}"
echo ""

# Download (both tools resume an existing partial file)

if command -v wget &> /dev/null; then
    wget -c -O "${PARTIAL_FILE}" "${HF_URL}"
else
    # --fail: exit non-zero on an HTTP error instead of saving the error page.
    curl --fail -L -C - -o "${PARTIAL_FILE}" "${HF_URL}"
fi

mv -- "${PARTIAL_FILE}" "${MODEL_FILE}"

echo ""
echo "Download complete."
echo "Model saved to: ${OUTPUT_DIR}/${MODEL_FILE}"
2 changes: 2 additions & 0 deletions ports/docwire/portfile.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS
asan ADDRESS_SANITIZER
tsan THREAD_SANITIZER
helgrind HELGRIND_ENABLED
local-ai DOCWIRE_LOCAL_CT2
llama-engine DOCWIRE_LLAMA
)

if(DEFINED ENV{CMAKE_MESSAGE_LOG_LEVEL})
Expand Down
38 changes: 26 additions & 12 deletions ports/docwire/vcpkg.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,32 @@
"callgrind":
{
"description": "Enable valgrind callgrind in automatic tests"
},
"local-ai":
{
"description": "Enable local AI runtime",
"dependencies": [
"ctranslate2",
Comment thread
reeshabh90 marked this conversation as resolved.
"sentencepiece",
"multilingual-e5-small-ct2-int8",
"flan-t5-large-ct2-int8"
]
},
"llama-engine":
{
"description": "Enable GGUF-based LLM inference (llama.cpp)",
"dependencies": [
{ "name": "docwire", "features": ["local-ai"] },
"llama-cpp"
]
},
"llama-qwen":
{
"description": "Install Qwen2 7B GGUF model",
"dependencies": [
{ "name": "docwire", "features": ["llama-engine"] },
"qwen2-7b-instruct-q4-k-m"
]
}
},
"dependencies": [
Expand Down Expand Up @@ -99,18 +125,6 @@
{
"name": "tessdata-fast"
},
{
"name": "ctranslate2"
},
{
"name": "sentencepiece"
},
{
"name": "flan-t5-large-ct2-int8"
},
{
"name": "multilingual-e5-small-ct2-int8"
},
{
"name": "rapidfuzz-cpp"
},
Expand Down
21 changes: 21 additions & 0 deletions ports/qwen2-7b-instruct-q4-k-m/portfile.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Components of the GGUF file name: "<model name>-<quantization>.gguf".
set(MODEL_NAME "qwen2-7b-instruct")
set(MODEL_QUANT "q4_k_m")

# Assemble the file name from its parts; it is used both in the download URL
# and as the name of the downloaded file.
string(CONCAT MODEL_FILE "${MODEL_NAME}" "-" "${MODEL_QUANT}" ".gguf")
Comment thread
reeshabh90 marked this conversation as resolved.

# Download the GGUF weights; the file is verified against the pinned SHA512.
vcpkg_download_distfile(
    MODEL_ARCHIVE
    URLS "https://huggingface.co/Qwen/Qwen2-7B-Instruct-GGUF/resolve/main/${MODEL_FILE}"
    FILENAME "${MODEL_FILE}"
    SHA512 39c1f9702856cf5faff13b672033c5c99246b5393550ed58ab9ba0eb2d5ce5d50cc2710b2d9f08d51ad6ce7f6b66826f5c916128fa06c3ac2f78865e167146b8
)

# This port installs only a data file under share/ and no headers, so the
# post-build check for a populated include/ directory does not apply.
set(VCPKG_POLICY_EMPTY_INCLUDE_FOLDER enabled)

# Paths are quoted so that directories containing spaces or ';' stay one argument.
file(INSTALL
    "${MODEL_ARCHIVE}"
    DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}"
)

# vcpkg expects every port to provide share/<port>/copyright.
file(WRITE
    "${CURRENT_PACKAGES_DIR}/share/${PORT}/copyright"
    "Model weights from HuggingFace repository Qwen/Qwen2-7B-Instruct-GGUF."
)
7 changes: 7 additions & 0 deletions ports/qwen2-7b-instruct-q4-k-m/vcpkg.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"name": "qwen2-7b-instruct-q4-k-m",
"version": "1.0.0",
"description": "Qwen2 7B Instruct GGUF model",
"homepage": "https://huggingface.co/Qwen/Qwen2-7B-Instruct-GGUF",
"license": "Apache-2.0"
}
14 changes: 13 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,19 @@ include(ocr.cmake)
include(mail.cmake)
include(archives.cmake)
include(ai.cmake)
include(local_ai.cmake)

if(DOCWIRE_LOCAL_CT2)
include(ai_ct2.cmake)
Comment thread
reeshabh90 marked this conversation as resolved.
endif()

if(DOCWIRE_LLAMA)
include(ai_llama.cmake)
endif()

if(DOCWIRE_LOCAL_CT2 OR DOCWIRE_LLAMA)
include(local_ai.cmake)
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it should be included always because user can bring his own implementation

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, but, I will cross verify this on my local build.

endif()

include(fuzzy_match.cmake)
include(content_type.cmake)

Expand Down
16 changes: 5 additions & 11 deletions src/ai.cmake
Original file line number Diff line number Diff line change
@@ -1,13 +1,7 @@
set(EMPTY_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/ai_empty.cpp)
file(GENERATE OUTPUT ${EMPTY_SOURCE} CONTENT "
#include \"ai_elements.h\"
namespace docwire::ai
{
// This dummy function is required to ensure that the shared library is created.
DOCWIRE_AI_EXPORT void dummy_function_for_docwire_ai() {}
}
")
add_library(docwire_ai SHARED ${EMPTY_SOURCE})

add_library(docwire_ai SHARED model_chain_element.cpp ai_summarize.cpp ai_translate.cpp ai_embed.cpp ai_task.cpp)

target_link_libraries(docwire_ai PUBLIC docwire_core)

target_compile_features(docwire_ai PUBLIC cxx_std_20)
if(MSVC)
Expand All @@ -27,4 +21,4 @@ endif()
include(GenerateExportHeader)
generate_export_header(docwire_ai EXPORT_FILE_NAME ai_export.h)
target_include_directories(docwire_ai PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ai_export.h DESTINATION include/docwire)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ai_export.h DESTINATION include/docwire)
51 changes: 51 additions & 0 deletions src/ai_ct2.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

# Build script for docwire_ai_ct2, the CTranslate2-based local AI backend.
# src/CMakeLists.txt includes this file only when DOCWIRE_LOCAL_CT2 is ON.
message(STATUS "DOCWIRE_LOCAL_CT2 enabled: Building CT2 backend.")

add_library(docwire_ai_ct2 SHARED
    ct2_runner.cpp
    tokenizer.cpp
)

# PUBLIC: targets linking docwire_ai_ct2 are compiled with DOCWIRE_LOCAL_CT2 too.
target_compile_definitions(docwire_ai_ct2 PUBLIC DOCWIRE_LOCAL_CT2)

find_package(Boost REQUIRED COMPONENTS filesystem system json)
find_package(ctranslate2 CONFIG REQUIRED)

# find_library() accepts REQUIRED only since CMake 3.18, while the project
# declares cmake_minimum_required(VERSION 3.17). Check the result explicitly so
# a missing library is reported at configure time on every supported version.
find_library(sentencepiece_LIBRARIES sentencepiece)
if(NOT sentencepiece_LIBRARIES)
    message(FATAL_ERROR
        "sentencepiece library not found; it is required when DOCWIRE_LOCAL_CT2 is enabled.")
endif()

# With MSVC, the Abseil and protobuf-lite targets are added to the sentencepiece
# link list so that they are linked together with it.
if(MSVC)
    find_package(absl CONFIG REQUIRED)
    list(APPEND sentencepiece_LIBRARIES
        absl::strings
        absl::flags
        absl::flags_parse
        absl::log
        absl::check
    )

    find_package(protobuf CONFIG REQUIRED)
    list(APPEND sentencepiece_LIBRARIES protobuf::libprotobuf-lite)
endif()

target_link_libraries(docwire_ai_ct2
    PRIVATE
        docwire_core
        docwire_ai
        Boost::filesystem
        Boost::system
        Boost::json
        CTranslate2::ctranslate2
        ${sentencepiece_LIBRARIES}
)

# Locate the two model directories and register them as resources of the target.
docwire_find_resource(FLAN_T5_FULL_PATH REL_PATH "flan-t5-large-ct2-int8" REQUIRED)
docwire_target_resources(docwire_ai_ct2 "flan-t5-large-ct2-int8" SOURCE "${FLAN_T5_FULL_PATH}")

docwire_find_resource(E5_MODEL_FULL_PATH REL_PATH "multilingual-e5-small-ct2-int8" REQUIRED)
docwire_target_resources(docwire_ai_ct2 "multilingual-e5-small-ct2-int8" SOURCE "${E5_MODEL_FULL_PATH}")

# Install the debugger symbols next to the DLL for Debug configurations.
if(MSVC)
    install(FILES $<TARGET_PDB_FILE:docwire_ai_ct2> DESTINATION bin CONFIGURATIONS Debug)
endif()

# Generate ai_ct2_export.h (export macros) in the binary directory.
include(GenerateExportHeader)
generate_export_header(docwire_ai_ct2 EXPORT_FILE_NAME ai_ct2_export.h)

# The generated header is found in the binary directory while building and
# under include/docwire once installed.
target_include_directories(docwire_ai_ct2
    PUBLIC
        $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
        $<INSTALL_INTERFACE:include/docwire>
)

install(TARGETS docwire_ai_ct2 EXPORT docwire_targets)

install(
    FILES "${CMAKE_CURRENT_BINARY_DIR}/ai_ct2_export.h"
    DESTINATION include/docwire
)
Loading