Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
d2c86e6
feat(java): M5b — bundle integration, classpath loading, CI (#2)
tylern91 Apr 30, 2026
770f8c4
feat(java): M6 — BigramScores.getScore(i,j) with CSR binary search (#3)
tylern91 Apr 30, 2026
6b34b56
fix(bigram): DEFAULT_SCORE 0.0f — C++ uses additive bonus, not penalt…
tylern91 Apr 30, 2026
938429d
feat(java): M7b — Segmenter Viterbi DP core (NORMAL mode, ASCII + sin…
tylern91 Apr 30, 2026
f77e06e
test(segmenter): M7c — verify space-edge trie traversal for multi-syl…
tylern91 Apr 30, 2026
f9a0ccc
feat(segmenter): M7d — sticky syllable Viterbi with SyllableTrie + Bi…
tylern91 Apr 30, 2026
d2c0641
feat(java): M7e — HOST and URL tokenization modes (#8)
tylern91 Apr 30, 2026
7803414
feat(java): M7f — post-hoc token merging rules (#9)
tylern91 Apr 30, 2026
fe1ed0e
feat(java): M7g — Viterbi shouldGo gate + golden-file IT tests (#10)
tylern91 Apr 30, 2026
85d33b8
feat(java): M8 — keepPunct filtering in Segmenter.segment() (#11)
tylern91 Apr 30, 2026
1279680
fix(java): M9 — P0 hardening (bigram leak, rowOffset validation, NaN …
tylern91 Apr 30, 2026
89d93e8
chore(java): M9 — CI & test gap fixes (#13)
tylern91 Apr 30, 2026
74bf1a2
chore(java): M9 code-cleanup batch — dead code, singleton, comments (…
tylern91 Apr 30, 2026
372d490
ci(java): drop redundant push trigger, scope to pull_request (#15)
tylern91 Apr 30, 2026
80ee31a
docs: update README and RELEASE for standalone Java module (#16)
tylern91 Apr 30, 2026
db07599
chore: drop legacy JNI build, add CLAUDE.md, gitignore .serena (#17)
tylern91 Apr 30, 2026
fa7ddca
chore(claude): update gitignore (#18)
tylern91 May 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Treat compiled dict binary files as binary — no line-ending conversion or diffing
*.bin binary
45 changes: 45 additions & 0 deletions .github/workflows/java-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: Java CI

on:
pull_request:
types: [opened, synchronize, reopened]
paths:
- "java/**"
- ".github/workflows/java-ci.yml"

defaults:
run:
working-directory: java

permissions:
contents: read

jobs:
build-and-test:
name: Build + unit tests
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v6

- uses: actions/setup-java@v5.2.0
with:
java-version: "21"
distribution: "temurin"
cache: "maven"

- name: Build + all tests
env:
REQUIRE_DICTS: "1"
run: mvn -B verify

- name: Verify dicts JAR contains committed .bin files
run: |
jar tf coccoc-tokenizer-java-dicts/target/coccoc-tokenizer-java-dicts-1.0.0-SNAPSHOT.jar \
| grep -E "\.bin$" | sort
# Assert both structural tries are present
jar tf coccoc-tokenizer-java-dicts/target/coccoc-tokenizer-java-dicts-1.0.0-SNAPSHOT.jar \
| grep -q "com/coccoc/dicts/multiterm.bin"
jar tf coccoc-tokenizer-java-dicts/target/coccoc-tokenizer-java-dicts-1.0.0-SNAPSHOT.jar \
| grep -q "com/coccoc/dicts/syllable.bin"
echo "Dict JAR contents verified."
8 changes: 6 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
build
install
/build
/install
python/*.c
python/*.cpp
python/*.html

.worktrees
.serena/
.claude/
17 changes: 0 additions & 17 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,23 +48,6 @@ INSTALL (FILES ${CMAKE_BINARY_DIR}/${MULTITERM_DICT_DUMP} DESTINATION share/toke
INSTALL (FILES ${CMAKE_BINARY_DIR}/${SYLLABLE_DICT_DUMP} DESTINATION share/tokenizer/dicts)
INSTALL (FILES ${CMAKE_BINARY_DIR}/${NONTONE_PAIR_DICT_DUMP} DESTINATION share/tokenizer/dicts)

IF (${BUILD_JAVA})
ADD_CUSTOM_TARGET (compile_java ALL DEPENDS ${CMAKE_BINARY_DIR}/coccoc-tokenizer.jar)
ADD_CUSTOM_COMMAND (
OUTPUT ${CMAKE_BINARY_DIR}/coccoc-tokenizer.jar
COMMAND ${CMAKE_SOURCE_DIR}/java/build_java.sh ${CMAKE_BINARY_DIR}
VERBATIM
)
INSTALL (FILES ${CMAKE_BINARY_DIR}/coccoc-tokenizer.jar DESTINATION share/java)

IF(CMAKE_SYSTEM_NAME STREQUAL Darwin)
INSTALL (FILES ${CMAKE_BINARY_DIR}/libcoccoc_tokenizer_jni.dylib DESTINATION lib)
ELSE ()
INSTALL (FILES ${CMAKE_BINARY_DIR}/libcoccoc_tokenizer_jni.so DESTINATION lib)
ENDIF ()

ENDIF ()

IF (${BUILD_PYTHON})
# XXX Some build files in this target are generated inside source tree, should fix later
ADD_CUSTOM_TARGET (compile_python ALL DEPENDS ${CMAKE_BINARY_DIR}/python/lib)
Expand Down
63 changes: 46 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,7 @@ $ cmake ..
# make install
```

To include java bindings:

```
$ mkdir build && cd build
$ cmake -DBUILD_JAVA=1 ..
# make install
```
For the standalone pure-Java Maven module (no native libraries required), see [Using the Java library](#using-the-java-library).

To include python bindings - install [cython](https://pypi.org/project/Cython/) package and compile wrapper code (only Python3 is supported):

Expand All @@ -37,7 +31,7 @@ $ dpkg-buildpackage <options> # from source tree root
If you want to build and install everything into your sandbox, you can use something like this (it builds everything and installs it into ~/.local, which many applications and frameworks treat as a standard sandbox PREFIX):
```
$ mkdir build && cd build
$ cmake -DBUILD_JAVA=1 -DBUILD_PYTHON=1 -DCMAKE_INSTALL_PREFIX=~/.local ..
$ cmake -DBUILD_PYTHON=1 -DCMAKE_INSTALL_PREFIX=~/.local ..
$ make install
```

Expand Down Expand Up @@ -148,21 +142,56 @@ struct FullToken : Token {

```

## Using Java bindings
## Using the Java library

A standalone pure-Java module is available as a Maven artifact. It requires no native libraries and runs on any platform with Java 21+.

A java interface is provided to be used in java projects. Internally it utilizes JNI and the Unsafe API to connect Java and C++. You can find an example of its usage in `Tokenizer` class's main function:
### Getting the library

Build and install the module to your local Maven repository:

```
java/src/java/Tokenizer.java
$ cd java
$ mvn install -DskipTests
```

To run this test class from source tree, use the following command:
Then add the dependency to your `pom.xml`:

```xml
<dependency>
<groupId>com.coccoc</groupId>
<artifactId>coccoc-tokenizer-java</artifactId>
<version>1.0.0-SNAPSHOT</version>
</dependency>
```
$ LD_LIBRARY_PATH=build java -cp build/coccoc-tokenizer.jar com.coccoc.Tokenizer "một câu văn tiếng Việt"

The companion `coccoc-tokenizer-java-dicts` artifact bundles the dictionary files on the classpath automatically, so no external path configuration is required.

### Usage

```java
import com.coccoc.Tokenizer;
import java.util.ArrayList;

// Load from bundled classpath dicts (recommended)
Tokenizer tokenizer = Tokenizer.getInstance();

// Or load from a custom dict directory on the filesystem
// Tokenizer tokenizer = Tokenizer.getInstance("/path/to/dicts");

// Returns tokens as a list of strings; multi-syllable tokens contain a space
ArrayList<String> tokens = tokenizer.segmentToStringList("Từng bước để trở thành một lập trình viên giỏi");
// [từng, bước, để, trở thành, một, lập trình, viên, giỏi]

// Keep punctuation in the result
tokenizer.segmentKeepPunctsToStringList("xin chào!");
// [xin chào, !]

// URL / host tokenization
tokenizer.segmentUrlToStringList("https://thegioididong.vn");
```

Normally `LD_LIBRARY_PATH` should point to a directory with `libcoccoc_tokenizer_jni.so` binary. If you have already installed deb package or `make install`-ed everything into your system, `LD_LIBRARY_PATH` is not needed as the binary will be taken from your system (`/usr/lib` or similar).
`Tokenizer` is a per-dict-path singleton and is safe to call concurrently from multiple threads.

## Using Python bindings

Expand All @@ -183,7 +212,7 @@ print(T.word_tokenize("xin chào, tôi là người Việt Nam", tokenize_option

## Other languages

Bindings for other languages are not yet implemented but it will be nice if someone can help to write them.
A standalone Java library is available (see above). Bindings for other languages are not yet implemented — contributions are welcome.

## Benchmark

Expand Down Expand Up @@ -231,6 +260,6 @@ We also don't apply any named entity recognition mechanisms within the tokenizer

## Future Plans

We'd love to introduce bindings for Python and maybe other languages later and we'd be happy if somebody can help us doing that. We are also thinking about adding POS tagger and more complex linguistic features later.
We are thinking about adding a POS tagger and more complex linguistic features. Bindings for other languages are welcome contributions.

If you find any issues or have any suggestions regarding further upgrades, please, report them here or write us through github.
If you find any issues or have any suggestions regarding further upgrades, please report them here or reach out through GitHub.
30 changes: 30 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,33 @@
# Java Standalone Module

A pure-Java implementation of the CocCoc Vietnamese tokenizer is now available as a Maven artifact under `java/`. It does not require any native libraries or a C++ build.

## Features

* **Classpath dict loading** — dictionary files are bundled inside the `coccoc-tokenizer-java-dicts` jar and loaded automatically via `Tokenizer.getInstance()`. No external file path needed.
* **Filesystem dict loading** — `Tokenizer.getInstance(String dictPath)` loads `multiterm.bin`, `syllable.bin`, and the optional `bigram.bin` from a directory on disk, matching the behaviour of the C++ library.
* **Full segmentation modes** — `NORMAL`, `HOST`, and `URL` modes match the C++ tokenizer's output.
* **keepPunct filtering** — the `keepPunctuation` flag (and the `segmentKeepPuncts*` convenience methods) mirrors the `-k` option in the CLI tool.
* **Thread safety** — the `Tokenizer` singleton is safe to call concurrently from multiple threads.
* **Java 21+** — built and tested with Temurin 21; no preview features required.

## Maven coordinates

```xml
<dependency>
<groupId>com.coccoc</groupId>
<artifactId>coccoc-tokenizer-java</artifactId>
<version>1.0.0-SNAPSHOT</version>
</dependency>
```

## Notes

* The bundled dictionary files (`multiterm.bin` ~19 MB, `syllable.bin` ~20 MB) add ~40 MB to the classpath. `bigram.bin` is optional and improves sticky-phrase segmentation when present.
* The int-constant API (`TOKENIZE_NORMAL`, `TOKENIZE_HOST`, `TOKENIZE_URL`) and the `segment4Transforming` / `segmentKeepPuncts` / `segmentUrl` overloads are provided for source-level compatibility with existing callers of the vendored `Tokenizer` class used in `elasticsearch-analysis-vietnamese`.

---

# Release 1.5

## Major Features and Improvement
Expand Down
2 changes: 1 addition & 1 deletion debian/rules
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export DH_OPTIONS
dh $@

override_dh_auto_configure:
dh_auto_configure -- -DBUILD_JAVA=1 -DBUILD_PYTHON=1
dh_auto_configure -- -DBUILD_PYTHON=1

override_dh_strip:

58 changes: 58 additions & 0 deletions java/.claude/.claude/tdd-guard/data/instructions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
## TDD Fundamentals

### The TDD Cycle
The foundation of TDD is the Red-Green-Refactor cycle:

1. **Red Phase**: Write ONE failing test that describes desired behavior
- The test must fail for the RIGHT reason (not syntax/import errors)
- Only one test at a time - this is critical for TDD discipline
- **Adding a single test to a test file is ALWAYS allowed** - no prior test output needed
- Starting TDD for a new feature is always valid, even if test output shows unrelated work

2. **Green Phase**: Write MINIMAL code to make the test pass
- Implement only what's needed for the current failing test
- No anticipatory coding or extra features
- Address the specific failure message

3. **Refactor Phase**: Improve code structure while keeping tests green
- Only allowed when relevant tests are passing
- Requires proof that tests have been run and are green
- Applies to BOTH implementation code and behavioral changes in test code (what assertions check)
- No refactoring with failing tests - fix them first

### Core Violations

1. **Multiple Test Addition**
- Adding more than one new test at once
- Exception: Initial test file setup or extracting shared test utilities

2. **Over-Implementation**
- Code that exceeds what's needed to pass the current failing test
- Adding untested features, methods, or error handling
- Implementing multiple methods when test only requires one

3. **Premature Implementation**
- Adding implementation before a test exists and fails properly
- Adding implementation without running the test first
- Behavioral refactoring when tests haven't been run or are failing

### Critical Principle: Incremental Development
Each step in TDD should address ONE specific issue:
- Test can't locate the impl (import/symbol unresolved) → Create empty stub only
- Test errors calling the impl (signature or call mismatch) → Adjust signature, stub body minimally
- Test fails on assertion (expected vs received) → Implement minimal logic only

### Reaching a Clean Red
Before a failing test becomes a useful Red, it has to run far enough to evaluate an assertion. Some failures happen before that point:
- The reporter shows no tests ran — the test file couldn't load (missing import, unresolved symbol).
- A test errored before its assertion — the impl's signature doesn't match the call, or the call threw mid-execution.

In both cases, the agent may adjust the impl: create missing stubs, change the signature to accept the test's call, or replace the body with a minimal form (empty, constant return, unchanged body with new params). This is part of reaching Red, not Refactoring.
No new logic is permitted at this step. Ask the agent if they forgot to stub.

### General Information
- In the refactor phase, it is perfectly fine to refactor both test and implementation code. That said, completely new functionality is not allowed. Types, clean up, abstractions, and helpers are allowed as long as they do not introduce new behavior.
- When a test-file diff restructures existing tests (new names, reordered, combined, split) and the intent isn't clearly "add many new tests," default to approval. The one-new-test rule is about intent to add behavior, not surface diff count.
- During refactor (tests green), adding types, interfaces, or constant literals to an existing or new file is always allowed — they add no runtime behavior by construction.
- During refactor (tests green), extracting helpers or functions whose behavior already lives elsewhere (covered by existing tests) into an existing or new file is also allowed. A function whose behavior appears nowhere else is net-new, not extraction, and requires a failing test first.
- Provide the agent with helpful directions so that they do not get stuck when blocking them.
1 change: 1 addition & 0 deletions java/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
target/
35 changes: 0 additions & 35 deletions java/build_java.sh

This file was deleted.

Loading