diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d3e70b4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Treat compiled dict binary files as binary — no line-ending conversion or diffing +*.bin binary diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml new file mode 100644 index 0000000..8981e4e --- /dev/null +++ b/.github/workflows/java-ci.yml @@ -0,0 +1,45 @@ +name: Java CI + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - "java/**" + - ".github/workflows/java-ci.yml" + +defaults: + run: + working-directory: java + +permissions: + contents: read + +jobs: + build-and-test: + name: Build + unit tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + + - uses: actions/setup-java@v5.2.0 + with: + java-version: "21" + distribution: "temurin" + cache: "maven" + + - name: Build + all tests + env: + REQUIRE_DICTS: "1" + run: mvn -B verify + + - name: Verify dicts JAR contains committed .bin files + run: | + jar tf coccoc-tokenizer-java-dicts/target/coccoc-tokenizer-java-dicts-1.0.0-SNAPSHOT.jar \ + | grep -E "\.bin$" | sort + # Assert both structural tries are present + jar tf coccoc-tokenizer-java-dicts/target/coccoc-tokenizer-java-dicts-1.0.0-SNAPSHOT.jar \ + | grep -q "com/coccoc/dicts/multiterm.bin" + jar tf coccoc-tokenizer-java-dicts/target/coccoc-tokenizer-java-dicts-1.0.0-SNAPSHOT.jar \ + | grep -q "com/coccoc/dicts/syllable.bin" + echo "Dict JAR contents verified." 
diff --git a/.gitignore b/.gitignore index 27b9e69..ff2d4f0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,9 @@ -build -install +/build +/install python/*.c python/*.cpp python/*.html + +.worktrees +.serena/ +.claude/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 7feaeee..2a171e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,23 +48,6 @@ INSTALL (FILES ${CMAKE_BINARY_DIR}/${MULTITERM_DICT_DUMP} DESTINATION share/toke INSTALL (FILES ${CMAKE_BINARY_DIR}/${SYLLABLE_DICT_DUMP} DESTINATION share/tokenizer/dicts) INSTALL (FILES ${CMAKE_BINARY_DIR}/${NONTONE_PAIR_DICT_DUMP} DESTINATION share/tokenizer/dicts) -IF (${BUILD_JAVA}) - ADD_CUSTOM_TARGET (compile_java ALL DEPENDS ${CMAKE_BINARY_DIR}/coccoc-tokenizer.jar) - ADD_CUSTOM_COMMAND ( - OUTPUT ${CMAKE_BINARY_DIR}/coccoc-tokenizer.jar - COMMAND ${CMAKE_SOURCE_DIR}/java/build_java.sh ${CMAKE_BINARY_DIR} - VERBATIM - ) - INSTALL (FILES ${CMAKE_BINARY_DIR}/coccoc-tokenizer.jar DESTINATION share/java) - - IF(CMAKE_SYSTEM_NAME STREQUAL Darwin) - INSTALL (FILES ${CMAKE_BINARY_DIR}/libcoccoc_tokenizer_jni.dylib DESTINATION lib) - ELSE () - INSTALL (FILES ${CMAKE_BINARY_DIR}/libcoccoc_tokenizer_jni.so DESTINATION lib) - ENDIF () - -ENDIF () - IF (${BUILD_PYTHON}) # XXX Some build files in this target are generated inside source tree, should fix later ADD_CUSTOM_TARGET (compile_python ALL DEPENDS ${CMAKE_BINARY_DIR}/python/lib) diff --git a/README.md b/README.md index 68c401a..1588eff 100644 --- a/README.md +++ b/README.md @@ -12,13 +12,7 @@ $ cmake .. # make install ``` -To include java bindings: - -``` -$ mkdir build && cd build -$ cmake -DBUILD_JAVA=1 .. -# make install -``` +For the standalone pure-Java Maven module (no native libraries required), see [Using the Java library](#using-the-java-library). 
To include python bindings - install [cython](https://pypi.org/project/Cython/) package and compile wrapper code (only Python3 is supported): @@ -37,7 +31,7 @@ $ dpkg-buildpackage # from source tree root If you want to build and install everything into your sandbox, you can use something like this (it will build everything and install into ~/.local, which is considered as a standard sandbox PREFIX by many applications and frameworks): ``` $ mdkir build && cd build -$ cmake -DBUILD_JAVA=1 -DBUILD_PYTHON=1 -DCMAKE_INSTALL_PREFIX=~/.local .. +$ cmake -DBUILD_PYTHON=1 -DCMAKE_INSTALL_PREFIX=~/.local .. $ make install ``` @@ -148,21 +142,56 @@ struct FullToken : Token { ``` -## Using Java bindings +## Using the Java library + +A standalone pure-Java module is available as a Maven artifact. It requires no native libraries and runs on any platform with Java 21+. -A java interface is provided to be used in java projects. Internally it utilizes JNI and the Unsafe API to connect Java and C++. You can find an example of its usage in `Tokenizer` class's main function: +### Getting the library + +Build and install the module to your local Maven repository: ``` -java/src/java/Tokenizer.java +$ cd java +$ mvn install -DskipTests ``` -To run this test class from source tree, use the following command: +Then add the dependency to your `pom.xml`: +```xml + + com.coccoc + coccoc-tokenizer-java + 1.0.0-SNAPSHOT + ``` -$ LD_LIBRARY_PATH=build java -cp build/coccoc-tokenizer.jar com.coccoc.Tokenizer "một câu văn tiếng Việt" + +The companion `coccoc-tokenizer-java-dicts` artifact bundles the dictionary files as classpath resources, so no external path configuration is required. Add it as a dependency alongside `coccoc-tokenizer-java`: the tokenizer module only references it with `test` scope, so it is not pulled in transitively. 
+ +### Usage + +```java +import com.coccoc.Tokenizer; +import java.util.ArrayList; + +// Load from bundled classpath dicts (recommended) +Tokenizer tokenizer = Tokenizer.getInstance(); + +// Or load from a custom dict directory on the filesystem +// Tokenizer tokenizer = Tokenizer.getInstance("/path/to/dicts"); + +// Returns tokens as a list of strings; multi-syllable tokens contain a space +ArrayList tokens = tokenizer.segmentToStringList("Từng bước để trở thành một lập trình viên giỏi"); +// [từng, bước, để, trở thành, một, lập trình, viên, giỏi] + +// Keep punctuation in the result +tokenizer.segmentKeepPunctsToStringList("xin chào!"); +// [xin chào, !] + +// URL / host tokenization +tokenizer.segmentUrlToStringList("https://thegioididong.vn"); ``` -Normally `LD_LIBRARY_PATH` should point to a directory with `libcoccoc_tokenizer_jni.so` binary. If you have already installed deb package or `make install`-ed everything into your system, `LD_LIBRARY_PATH` is not needed as the binary will be taken from your system (`/usr/lib` or similar). +`Tokenizer` is a process-wide singleton: the first `getInstance(...)` call decides which dicts are loaded, and a later call that asks for a different dict source throws `IllegalStateException`. The instance is safe to call concurrently from multiple threads. ## Using Python bindings @@ -183,7 +212,7 @@ print(T.word_tokenize("xin chào, tôi là người Việt Nam", tokenize_option ## Other languages -Bindings for other languages are not yet implemented but it will be nice if someone can help to write them. +A standalone Java library is available (see above). Bindings for other languages are not yet implemented — contributions are welcome. ## Benchmark @@ -231,6 +260,6 @@ We also don't apply any named entity recognition mechanisms within the tokenizer ## Future Plans -We'd love to introduce bindings for Python and maybe other languages later and we'd be happy if somebody can help us doing that. We are also thinking about adding POS tagger and more complex linguistic features later. +We are thinking about adding a POS tagger and more complex linguistic features. 
Bindings for other languages are welcome contributions. -If you find any issues or have any suggestions regarding further upgrades, please, report them here or write us through github. +If you find any issues or have any suggestions regarding further upgrades, please report them here or reach out through GitHub. diff --git a/RELEASE.md b/RELEASE.md index cf582d3..add9198 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,33 @@ +# Java Standalone Module + +A pure-Java implementation of the CocCoc Vietnamese tokenizer is now available as a Maven artifact under `java/`. It does not require any native libraries or a C++ build. + +## Features + +* **Classpath dict loading** — dictionary files are bundled inside the `coccoc-tokenizer-java-dicts` jar and loaded automatically via `Tokenizer.getInstance()`. No external file path needed. +* **Filesystem dict loading** — `Tokenizer.getInstance(String dictPath)` loads `multiterm.bin`, `syllable.bin`, and the optional `bigram.bin` from a directory on disk, matching the behaviour of the C++ library. +* **Full segmentation modes** — `NORMAL`, `HOST`, and `URL` modes match the C++ tokenizer's output. +* **keepPunct filtering** — the `keepPunctuation` flag (and the `segmentKeepPuncts*` convenience methods) mirrors the `-k` option in the CLI tool. +* **Thread safety** — the `Tokenizer` singleton is safe to call concurrently from multiple threads. +* **Java 21+** — built and tested with Temurin 21; no preview features required. + +## Maven coordinates + +```xml + + com.coccoc + coccoc-tokenizer-java + 1.0.0-SNAPSHOT + +``` + +## Notes + +* The bundled dict jars (`multiterm.bin` ~19 MB, `syllable.bin` ~20 MB) add ~40 MB to the classpath. `bigram.bin` is optional and improves sticky-phrase segmentation when present. 
+* The int-constant API (`TOKENIZE_NORMAL`, `TOKENIZE_HOST`, `TOKENIZE_URL`) and the `segment4Transforming` / `segmentKeepPuncts` / `segmentUrl` overloads are provided for source-level compatibility with existing callers of the vendored `Tokenizer` class used in `elasticsearch-analysis-vietnamese`. + +--- + # Release 1.5 ## Major Features and Improvement diff --git a/debian/rules b/debian/rules index 8d7267a..18f8d2d 100755 --- a/debian/rules +++ b/debian/rules @@ -20,7 +20,7 @@ export DH_OPTIONS dh $@ override_dh_auto_configure: - dh_auto_configure -- -DBUILD_JAVA=1 -DBUILD_PYTHON=1 + dh_auto_configure -- -DBUILD_PYTHON=1 override_dh_strip: diff --git a/java/.claude/.claude/tdd-guard/data/instructions.md b/java/.claude/.claude/tdd-guard/data/instructions.md new file mode 100644 index 0000000..22e7bc5 --- /dev/null +++ b/java/.claude/.claude/tdd-guard/data/instructions.md @@ -0,0 +1,58 @@ +## TDD Fundamentals + +### The TDD Cycle +The foundation of TDD is the Red-Green-Refactor cycle: + +1. **Red Phase**: Write ONE failing test that describes desired behavior + - The test must fail for the RIGHT reason (not syntax/import errors) + - Only one test at a time - this is critical for TDD discipline + - **Adding a single test to a test file is ALWAYS allowed** - no prior test output needed + - Starting TDD for a new feature is always valid, even if test output shows unrelated work + +2. **Green Phase**: Write MINIMAL code to make the test pass + - Implement only what's needed for the current failing test + - No anticipatory coding or extra features + - Address the specific failure message + +3. **Refactor Phase**: Improve code structure while keeping tests green + - Only allowed when relevant tests are passing + - Requires proof that tests have been run and are green + - Applies to BOTH implementation code and behavioral changes in test code (what assertions check) + - No refactoring with failing tests - fix them first + +### Core Violations + +1. 
**Multiple Test Addition** + - Adding more than one new test at once + - Exception: Initial test file setup or extracting shared test utilities + +2. **Over-Implementation** + - Code that exceeds what's needed to pass the current failing test + - Adding untested features, methods, or error handling + - Implementing multiple methods when test only requires one + +3. **Premature Implementation** + - Adding implementation before a test exists and fails properly + - Adding implementation without running the test first + - Behavioral refactoring when tests haven't been run or are failing + +### Critical Principle: Incremental Development +Each step in TDD should address ONE specific issue: +- Test can't locate the impl (import/symbol unresolved) → Create empty stub only +- Test errors calling the impl (signature or call mismatch) → Adjust signature, stub body minimally +- Test fails on assertion (expected vs received) → Implement minimal logic only + +### Reaching a Clean Red +Before a failing test becomes a useful Red, it has to run far enough to evaluate an assertion. Some failures happen before that point: +- The reporter shows no tests ran — the test file couldn't load (missing import, unresolved symbol). +- A test errored before its assertion — the impl's signature doesn't match the call, or the call threw mid-execution. + +In both cases, the agent may adjust the impl: create missing stubs, change the signature to accept the test's call, or replace the body with a minimal form (empty, constant return, unchanged body with new params). This is part of reaching Red, not Refactoring. +No new logic is permitted at this step. Ask the agent if they forgot to stub. + +### General Information +- In the refactor phase, it is perfectly fine to refactor both test and implementation code. That said, completely new functionality is not allowed. Types, clean up, abstractions, and helpers are allowed as long as they do not introduce new behavior. 
+- When a test-file diff restructures existing tests (new names, reordered, combined, split) and the intent isn't clearly "add many new tests," default to approval. The one-new-test rule is about intent to add behavior, not surface diff count. +- During refactor (tests green), adding types, interfaces, or constant literals to an existing or new file is always allowed — they add no runtime behavior by construction. +- During refactor (tests green), extracting helpers or functions whose behavior already lives elsewhere (covered by existing tests) into an existing or new file is also allowed. A function whose behavior appears nowhere else is net-new, not extraction, and requires a failing test first. +- Provide the agent with helpful directions so that they do not get stuck when blocking them. diff --git a/java/.gitignore b/java/.gitignore new file mode 100644 index 0000000..2f7896d --- /dev/null +++ b/java/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/java/build_java.sh b/java/build_java.sh deleted file mode 100755 index 271a9b3..0000000 --- a/java/build_java.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/sh - -if [ -z "$1" ]; then - echo >&2 "Usage: $0 " - exit 1 -fi - - -SOURCE_DIR=`dirname $0` -BUILD_DIR="$1" - -OS="$(uname | tr '[:upper:]' '[:lower:]')" -if [ "${OS}" = "darwin" ]; then - # macOS - JAVA_HOME="$(/usr/libexec/java_home)" - OUTPUT_FILE='libcoccoc_tokenizer_jni.dylib' - else - JAVA_HOME="$(dirname $(dirname $(readlink -f $(which javac))))" - OUTPUT_FILE='libcoccoc_tokenizer_jni.so' - fi - - -mkdir -p ${BUILD_DIR}/java -${JAVA_HOME}/bin/javac -h ${BUILD_DIR}/java -d ${BUILD_DIR}/java ${SOURCE_DIR}/src/java/*.java - -g++ -shared -Wall -Werror -std=c++11 -Wno-deprecated -O3 -DNDEBUG -ggdb -fPIC \ - -I ${SOURCE_DIR}/.. 
\ - -I ${BUILD_DIR}/auto \ - -I ${BUILD_DIR}/java \ - -I ${JAVA_HOME}/include \ - -I ${JAVA_HOME}/include/${OS} \ - -o ${BUILD_DIR}/${OUTPUT_FILE} \ - ${SOURCE_DIR}/src/jni/Tokenizer.cpp - -jar -cf ${BUILD_DIR}/coccoc-tokenizer.jar -C ${BUILD_DIR}/java . diff --git a/java/coccoc-tokenizer-java-dicts/pom.xml b/java/coccoc-tokenizer-java-dicts/pom.xml new file mode 100644 index 0000000..9b9518f --- /dev/null +++ b/java/coccoc-tokenizer-java-dicts/pom.xml @@ -0,0 +1,95 @@ + + + 4.0.0 + + + com.coccoc + coccoc-tokenizer-java-parent + 1.0.0-SNAPSHOT + ../pom.xml + + + coccoc-tokenizer-java-dicts + jar + + coccoc-tokenizer-java-dicts + Compiled dictionary data for the CocCoc Vietnamese tokenizer + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + default-compile + none + + + default-testCompile + none + + + + + org.apache.maven.plugins + maven-surefire-plugin + + + default-test + none + + + + + + + + + + regenerate-dicts + + + + org.codehaus.mojo + exec-maven-plugin + + + compile-dicts + generate-resources + + exec + + + java + + -cp + + ${settings.localRepository}/com/coccoc/coccoc-tokenizer-java/${project.version}/coccoc-tokenizer-java-${project.version}.jar + com.coccoc.tools.DictCompile + ${dicts.source.dir} + ${project.basedir}/src/main/resources/com/coccoc/dicts + + + + + + + + + + diff --git a/java/coccoc-tokenizer-java-dicts/src/main/resources/com/coccoc/dicts/multiterm.bin b/java/coccoc-tokenizer-java-dicts/src/main/resources/com/coccoc/dicts/multiterm.bin new file mode 100644 index 0000000..0fc680f Binary files /dev/null and b/java/coccoc-tokenizer-java-dicts/src/main/resources/com/coccoc/dicts/multiterm.bin differ diff --git a/java/coccoc-tokenizer-java-dicts/src/main/resources/com/coccoc/dicts/syllable.bin b/java/coccoc-tokenizer-java-dicts/src/main/resources/com/coccoc/dicts/syllable.bin new file mode 100644 index 0000000..332a230 Binary files /dev/null and b/java/coccoc-tokenizer-java-dicts/src/main/resources/com/coccoc/dicts/syllable.bin 
differ diff --git a/java/coccoc-tokenizer-java/pom.xml b/java/coccoc-tokenizer-java/pom.xml new file mode 100644 index 0000000..4d7587a --- /dev/null +++ b/java/coccoc-tokenizer-java/pom.xml @@ -0,0 +1,64 @@ + + + 4.0.0 + + + com.coccoc + coccoc-tokenizer-java-parent + 1.0.0-SNAPSHOT + ../pom.xml + + + coccoc-tokenizer-java + jar + + coccoc-tokenizer-java + Pure-Java Vietnamese tokenizer engine (code artifact) + + + + org.junit.jupiter + junit-jupiter + test + + + + com.coccoc + coccoc-tokenizer-java-dicts + ${project.version} + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + ${maven.compiler.release} + + + + org.apache.maven.plugins + maven-surefire-plugin + + + org.apache.maven.plugins + maven-failsafe-plugin + + + + integration-test + verify + + + + + + + diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/Token.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/Token.java new file mode 100644 index 0000000..d8e33d1 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/Token.java @@ -0,0 +1,133 @@ +package com.coccoc; + +import java.util.ArrayList; +import java.util.List; + +/** + * Token produced by {@link Tokenizer#segment}. Text, type and positions are fixed at construction, but the segment type can still be changed through the set*Seg() methods, so instances (including the shared FULL_STOP/COMMA/SPACE constants) are not fully immutable. + * + * Type and SegType ordinals must match token.hpp lines 11-20 so that any + * future JNI bridge or serialization layer sees the same integer values. 
+ */ +public final class Token implements Cloneable { + + public static final Token FULL_STOP = new Token(".", Type.PUNCT, SegType.END_SEG_TYPE, -1, -1); + public static final Token COMMA = new Token(",", Type.PUNCT, SegType.END_SEG_TYPE, -1, -1); + public static final Token SPACE = new Token(" ", Type.SPACE, null, -1, -1); + + public enum Type { + WORD, + NUMBER, + SPACE, + PUNCT, + WHOLE_URL, + SITE_URL; + + private static final Type[] VALUES = values(); + + public static Type fromInt(int i) { + return VALUES[i]; + } + } + + public enum SegType { + OTHER_SEG_TYPE, + SKIP_SEG_TYPE, + URL_SEG_TYPE, + END_URL_TYPE, + END_SEG_TYPE; + + private static final SegType[] VALUES = values(); + + public static SegType fromInt(int i) { + return VALUES[i]; + } + } + + private final String text; + private final Type type; + private SegType segType; + private boolean splittedByDot; + private final int startPos; + private final int endPos; + + public Token(String text, int start, int end) { + this(text, Type.WORD, null, start, end); + } + + public Token(String text, Type type, int start, int end) { + this(text, type, null, start, end); + } + + public Token(String text, Type type, SegType segType, int start, int end) { + this(text, type, segType, false, start, end); + } + + public Token(String text, Type type, SegType segType, boolean splittedByDot, int start, int end) { + this.text = text; + this.type = type; + this.segType = segType; + this.splittedByDot = splittedByDot; + this.startPos = start; + this.endPos = end > 0 ? end : (start >= 0 ? 
start + text.length() : start); + } + + public String getText() { return text; } + public Type getType() { return type; } + public int getPos() { return startPos; } + public int getEndPos() { return endPos; } + public SegType getSegType() { return segType; } + public boolean isSplittedByDot() { return splittedByDot; } + + public boolean isWord() { return type == Type.WORD; } + public boolean isPunct() { return type == Type.PUNCT; } + public boolean isNumber() { return type == Type.NUMBER; } + public boolean isWholeUrl() { return type == Type.WHOLE_URL; } + public boolean isSiteUrl() { return type == Type.SITE_URL; } + public boolean isSpace() { return type == Type.SPACE; } + public boolean isWordOrNumber() { return isWord() || isNumber() || isSiteUrl(); } + + public boolean isEndSeg() { return segType == SegType.END_SEG_TYPE; } + public boolean isUrlSeg() { return segType == SegType.URL_SEG_TYPE; } + public boolean isEndUrlSeg() { return segType == SegType.END_URL_TYPE; } + public boolean isSkipSeg() { return segType == SegType.SKIP_SEG_TYPE; } + public boolean isOtherSeg() { return segType == SegType.OTHER_SEG_TYPE; } + + public void setEndSeg() { segType = SegType.END_SEG_TYPE; } + public void setOtherSeg() { segType = SegType.OTHER_SEG_TYPE; } + public void setEndUrlSeg() { segType = SegType.END_URL_TYPE; } + public void setUrlSeg() { segType = SegType.URL_SEG_TYPE; } + public void setSkipSeg() { segType = SegType.SKIP_SEG_TYPE; } + + public Token cloneWithNewText(String newText, int newEnd) { + return new Token(newText, type, segType, splittedByDot, startPos, newEnd); + } + + public static ArrayList toStringList(List tokens) { + ArrayList out = new ArrayList<>(tokens.size()); + for (Token t : tokens) out.add(t.getText()); + return out; + } + + @Override + public Token clone() { + return new Token(text, type, segType, splittedByDot, startPos, endPos); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (!(obj instanceof 
Token that)) return false; + return text.equals(that.text) && type == that.type; + } + + @Override + public int hashCode() { + return text.hashCode() ^ type.hashCode(); + } + + @Override + public String toString() { + return type + " `" + text + "` " + startPos + '-' + endPos; + } +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/TokenizeOption.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/TokenizeOption.java new file mode 100644 index 0000000..e7d3769 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/TokenizeOption.java @@ -0,0 +1,24 @@ +package com.coccoc; + +/** + * Tokenization mode — ordinals must match tokenizer/tokenizer.hpp lines 20-22. + * + * NORMAL(0): standard word segmentation + * HOST(1): dot-split hostname tokenization + * URL(2): full URL tokenization (sticky syllable + host) + */ +public enum TokenizeOption { + NORMAL(0), + HOST(1), + URL(2); + + private final int value; + + TokenizeOption(int value) { + this.value = value; + } + + public int value() { + return value; + } +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/Tokenizer.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/Tokenizer.java new file mode 100644 index 0000000..318f824 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/Tokenizer.java @@ -0,0 +1,181 @@ +package com.coccoc; + +import com.coccoc.internal.bigram.BigramScores; +import com.coccoc.internal.io.DictReader; +import com.coccoc.internal.trie.MultitermTrie; +import com.coccoc.internal.segment.Segmenter; +import com.coccoc.internal.lang.VnLangTool; +import com.coccoc.internal.trie.SyllableTrie; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +/** + * Public facade for the CocCoc Vietnamese tokenizer. 
+ * + * API is binary-compatible with the vendored Tokenizer in + * elasticsearch-analysis-vietnamese so the plugin can swap to this Maven + * artifact without changing any call sites. + * + * Lifecycle: one process-wide instance, lazily created by the first getInstance() call; a later call naming a different dict source throws IllegalStateException. + */ +public class Tokenizer { + + // Integer constants kept for source-level back-compat with old callers. + public static final int TOKENIZE_NORMAL = 0; + public static final int TOKENIZE_HOST = 1; + public static final int TOKENIZE_URL = 2; + + private static final String CLASSPATH_DICTS = "com/coccoc/dicts"; + // Sentinel used as initializedDictPath when loaded from classpath. + private static final String CLASSPATH_DICT_PATH = "classpath:" + CLASSPATH_DICTS; + + // Both fields are only ever read/written inside synchronized getInstance() methods. + private static Tokenizer instance; + private static String initializedDictPath; + + private final MultitermTrie multitermTrie; + private final SyllableTrie syllableTrie; + // Nullable: bigram.bin may not be bundled in the classpath dicts JAR. + private final BigramScores bigramScores; + private final String dictPath; + private final Segmenter segmenter; + + // ----------------------------------------------------------------------- + // Singleton factories + // ----------------------------------------------------------------------- + + /** + * Load dict files from the bundled classpath resources under + * {@code com/coccoc/dicts/} (packaged in {@code coccoc-tokenizer-java-dicts}). + * bigram.bin is optional — absent if the dicts JAR omits it. 
+ */ + public static synchronized Tokenizer getInstance() throws IOException { + if (instance == null) { + instance = new Tokenizer(); + initializedDictPath = CLASSPATH_DICT_PATH; + } else if (!CLASSPATH_DICT_PATH.equals(initializedDictPath)) { + throw new IllegalStateException( + "Tokenizer already initialized with dictPath=" + initializedDictPath); + } + return instance; + } + + /** + * Load dict files from the given filesystem directory. + * bigram.bin is optional — loaded only if the file exists in the directory. + */ + public static synchronized Tokenizer getInstance(String dictPath) throws IOException { + if (instance == null) { + instance = new Tokenizer(dictPath); + initializedDictPath = dictPath; + } else if (!initializedDictPath.equals(dictPath)) { + throw new IllegalStateException( + "Tokenizer already initialized with dictPath=" + initializedDictPath); + } + return instance; + } + + // ----------------------------------------------------------------------- + // Constructors + // ----------------------------------------------------------------------- + + /** Load from classpath resources (called by no-arg getInstance()). */ + private Tokenizer() throws IOException { + ClassLoader cl = Tokenizer.class.getClassLoader(); + try (InputStream mt = requireResource(cl, "multiterm.bin"); + InputStream sy = requireResource(cl, "syllable.bin")) { + this.multitermTrie = DictReader.readMultiterm(mt, "multiterm.bin"); + this.syllableTrie = DictReader.readSyllable(sy, "syllable.bin"); + } + try (InputStream bigramIn = cl.getResourceAsStream(CLASSPATH_DICTS + "/bigram.bin")) { + this.bigramScores = bigramIn != null ? DictReader.readBigram(bigramIn, "bigram.bin") : null; + } + VnLangTool.initSimple(); + this.segmenter = new Segmenter(this.multitermTrie); + this.dictPath = CLASSPATH_DICT_PATH; + } + + /** Load from a filesystem directory (called by getInstance(String)). 
*/ + Tokenizer(String dictPath) throws IOException { + Path dir = Path.of(dictPath); + this.multitermTrie = DictReader.readMultiterm(dir.resolve("multiterm.bin")); + this.syllableTrie = DictReader.readSyllable(dir.resolve("syllable.bin")); + Path bigramPath = dir.resolve("bigram.bin"); + this.bigramScores = Files.exists(bigramPath) ? DictReader.readBigram(bigramPath) : null; + this.dictPath = dictPath; + VnLangTool.initSimple(); + this.segmenter = new Segmenter(this.multitermTrie); + } + + private static InputStream requireResource(ClassLoader cl, String name) throws IOException { + InputStream in = cl.getResourceAsStream(CLASSPATH_DICTS + "/" + name); + if (in == null) + throw new IOException("missing classpath resource: " + CLASSPATH_DICTS + "/" + name); + return in; + } + + // ----------------------------------------------------------------------- + // Primary segment API (enum-based — used by the ES plugin) + // ----------------------------------------------------------------------- + + public List segment(String text, TokenizeOption option, boolean keepPunctuation) { + return segmenter.segment(text, option, keepPunctuation); + } + + // ----------------------------------------------------------------------- + // Convenience overloads (int-based — back-compat with original module) + // ----------------------------------------------------------------------- + + public ArrayList segment(String text, boolean forTransforming, int tokenizeOption, boolean keepPuncts) { + return new ArrayList<>(segmenter.segment(text, TokenizeOption.values()[tokenizeOption], keepPuncts)); + } + + public ArrayList segment(String text, boolean forTransforming, int tokenizeOption) { + return segment(text, forTransforming, tokenizeOption, forTransforming); + } + + public ArrayList segment(String text, int tokenizeOption) { + return segment(text, false, tokenizeOption); + } + + public ArrayList segment(String text, boolean forTransforming) { + return segment(text, forTransforming, 
TOKENIZE_NORMAL); + } + + public ArrayList segment(String text) { + return segment(text, false); + } + + public ArrayList segmentToStringList(String text) { + return Token.toStringList(segment(text, false)); + } + + public ArrayList segmentKeepPuncts(String text) { + return segment(text, false, TOKENIZE_NORMAL, true); + } + + public ArrayList segmentKeepPunctsToStringList(String text) { + return Token.toStringList(segmentKeepPuncts(text)); + } + + public ArrayList segmentUrl(String text) { + return segment(text, false, TOKENIZE_URL); + } + + public ArrayList segmentUrlToStringList(String text) { + return Token.toStringList(segmentUrl(text)); + } + + public ArrayList segment4Transforming(String text) { + return segment(text, true, TOKENIZE_NORMAL); + } + + public ArrayList segment4Transforming(String text, int tokenizeOption) { + return segment(text, true, tokenizeOption); + } +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/bigram/BigramScores.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/bigram/BigramScores.java new file mode 100644 index 0000000..0c31404 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/bigram/BigramScores.java @@ -0,0 +1,33 @@ +package com.coccoc.internal.bigram; + +import java.util.Arrays; + +/** + * CSR-format bigram frequency scores loaded from bigram.bin. + * Populated by DictReader. + */ +public final class BigramScores { + public static final float DEFAULT_SCORE = 0.0f; + + private final int[] rowOffset; + private final int[] colIndex; + private final float[] value; + + public BigramScores(int[] rowOffset, int[] colIndex, float[] value) { + this.rowOffset = rowOffset; + this.colIndex = colIndex; + this.value = value; + } + + public float getScore(int i, int j) { + int start = rowOffset[i]; + int end = rowOffset[i + 1]; + int pos = Arrays.binarySearch(colIndex, start, end, j); + return pos >= 0 ? 
value[pos] : DEFAULT_SCORE; + } + + public int[] rowOffsets() { return Arrays.copyOf(rowOffset, rowOffset.length); } + public int[] colIndex() { return Arrays.copyOf(colIndex, colIndex.length); } + public float[] values() { return Arrays.copyOf(value, value.length); } + public int rowCount() { return rowOffset.length - 1; } +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/build/SyllablePacker.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/build/SyllablePacker.java new file mode 100644 index 0000000..b9bd378 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/build/SyllablePacker.java @@ -0,0 +1,65 @@ +package com.coccoc.internal.build; + +import com.coccoc.internal.trie.SyllableTrie; +import java.util.*; + +/** + * Packs a hash-based syllable trie into a SyllableTrie (DA-trie with index[] array). + * The index[] array starts filled with -1 (unassigned); DictCompile sets it after reading Freq2NontoneUniFile. + */ +public final class SyllablePacker { + + private SyllablePacker() {} + + public static SyllableTrie packFromPool(List pool) { + // Build alphabet (same as TriePacker) + TreeSet alphabetSet = new TreeSet<>(); + for (TriePacker.HashNode node : pool) alphabetSet.addAll(node.children.keySet()); + + int[] charMap; + if (alphabetSet.isEmpty()) { + charMap = new int[0]; + } else { + int maxCp = alphabetSet.last(); + charMap = new int[maxCp + 1]; + Arrays.fill(charMap, -1); + int idx = 0; + for (int cp : alphabetSet) charMap[cp] = idx++; + } + int alphabetSize = alphabetSet.size(); + + int[] positions = TriePacker.construct(pool, charMap, alphabetSize); + + int lastPos = 0; + for (int p : positions) if (p > lastPos) lastPos = p; + int sz = lastPos + alphabetSize + 1; + + int[] base = new int[sz]; + int[] parent = new int[sz]; + float[] weight = new float[sz]; + int[] index = new int[sz]; // bigram row indices, -1 = unassigned + Arrays.fill(parent, -1); + Arrays.fill(index, -1); + + int[] mapping = new 
int[pool.size()]; + mapping[0] = 0; + + for (int i = 0; i < pool.size(); i++) { + TriePacker.HashNode node = pool.get(i); + int selfSlot = mapping[i]; + base[selfSlot] = positions[i]; + weight[selfSlot] = node.weight; + + for (Map.Entry entry : node.children.entrySet()) { + int cp = entry.getKey(); + int childIdx = entry.getValue(); + int ci = charMap[cp]; + int slot = base[selfSlot] + ci; + mapping[childIdx] = slot; + parent[slot] = selfSlot; + } + } + + return new SyllableTrie(charMap, base, parent, index, weight); + } +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/build/TriePacker.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/build/TriePacker.java new file mode 100644 index 0000000..53dc117 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/build/TriePacker.java @@ -0,0 +1,213 @@ +package com.coccoc.internal.build; + +import com.coccoc.internal.trie.MultitermTrie; +import java.util.*; + +/** + * Packs a hash-based trie into a DA-trie (parallel arrays). + * Port of da_trie.hpp build_trie() + construct() — the construction half only. + * The runtime lookup half lives in DoubleArrayTrie. 
+ */ +public final class TriePacker { + + // ----------------------------------------------------------------------- + // Intermediate hash-trie node (C++ HashTrieNode analog) + // ----------------------------------------------------------------------- + + public static final class HashNode { + public int frequency = -1; + public float weight = 0f; + public boolean isSpecial = false; + public int spaceCount = 0; + /** children: codepoint → node index in the flat list */ + public final TreeMap children = new TreeMap<>(); + + public boolean isEnding() { return frequency >= 0; } + } + + // ----------------------------------------------------------------------- + // Build hash trie from word list (for tests — uniform weight 1.0f) + // ----------------------------------------------------------------------- + + public static HashNode buildHashTrie(String[] words) { + List pool = new ArrayList<>(); + pool.add(new HashNode()); // root + + for (String word : words) { + int cur = 0; + for (int cp : word.codePoints().toArray()) { + HashNode node = pool.get(cur); + if (!node.children.containsKey(cp)) { + node.children.put(cp, pool.size()); + pool.add(new HashNode()); + } + cur = node.children.get(cp); + } + pool.get(cur).frequency = 1; + pool.get(cur).weight = 1.0f; + } + root_pool = pool; + return pool.get(0); + } + + // Package-private pool reference set by buildHashTrie (test helper). 
+ static List root_pool; + + // ----------------------------------------------------------------------- + // Pack: hash trie → MultitermTrie + // ----------------------------------------------------------------------- + + public static MultitermTrie pack(HashNode root) { + if (root_pool == null) + throw new IllegalStateException("call buildHashTrie() before pack()"); + return packFromPool(root_pool); + } + + public static MultitermTrie packFromPool(List pool) { + // Build alphabet + TreeSet alphabetSet = new TreeSet<>(); + for (HashNode node : pool) alphabetSet.addAll(node.children.keySet()); + + int[] charMap; + if (alphabetSet.isEmpty()) { + charMap = new int[0]; + } else { + int maxCp = alphabetSet.last(); + charMap = new int[maxCp + 1]; + Arrays.fill(charMap, -1); + int idx = 0; + for (int cp : alphabetSet) charMap[cp] = idx++; + } + int alphabetSize = alphabetSet.size(); + + int[] positions = construct(pool, charMap, alphabetSize); + + // DA pool size: largest base + alphabet + int lastPos = 0; + for (int p : positions) if (p > lastPos) lastPos = p; + int sz = lastPos + alphabetSize + 1; + + int[] base = new int[sz]; + int[] parent = new int[sz]; + float[] weight = new float[sz]; + byte[] flags = new byte[sz]; + Arrays.fill(parent, -1); + + // mapping[hashNodeIdx] = DA pool slot index + int[] mapping = new int[pool.size()]; + mapping[0] = 0; + + for (int i = 0; i < pool.size(); i++) { + HashNode node = pool.get(i); + int selfSlot = mapping[i]; + base[selfSlot] = positions[i]; + if (node.isEnding()) flags[selfSlot] |= 1; + if (node.isSpecial) flags[selfSlot] |= 2; + weight[selfSlot] = node.weight; + + for (Map.Entry entry : node.children.entrySet()) { + int cp = entry.getKey(); + int childIdx = entry.getValue(); + int ci = charMap[cp]; + int slot = base[selfSlot] + ci; + mapping[childIdx] = slot; + parent[slot] = selfSlot; + } + } + + return new MultitermTrie(charMap, base, parent, weight, flags); + } + + // 
----------------------------------------------------------------------- + // construct: find base positions for all hash-trie nodes. + // Port of da_trie.hpp:78-189. + // + // Invariant: pos[k] = sorted set of base positions p (p >= 1) where + // state[p + k] == false (i.e. slot k at base p is currently free). + // + // Extension: when disclosing new base position curEnd, state[curEnd + k] + // is freshly false for all k → add curEnd to every pos[k]. + // + // Removal: when occupying DA slot curPos = foundPos + offset, for every k + // remove (curPos - k) from pos[k] because state[curPos] is now true. + // ----------------------------------------------------------------------- + + static int[] construct(List pool, int[] charMap, int alphabetSize) { + if (alphabetSize == 0) return new int[pool.size()]; + + // pos[k] = sorted candidate base positions where slot k is free + @SuppressWarnings("unchecked") + TreeSet[] pos = new TreeSet[alphabetSize]; + for (int k = 0; k < alphabetSize; k++) { + pos[k] = new TreeSet<>(); + // Base position 1: state[1+k] is free for all k initially + pos[k].add(1); + } + + // state[p] = true → DA pool position p is occupied + boolean[] state = new boolean[alphabetSize + 2]; + // curEnd: next base position not yet disclosed into pos sets. + // We start at 2 because base=1 was seeded above. 
+ int curEnd = 2; + + int[] res = new int[pool.size()]; + + for (int i = 0; i < pool.size(); i++) { + HashNode node = pool.get(i); + if (node.children.isEmpty()) { + res[i] = 0; + continue; + } + + // Build mask: sorted alphabet indices of children (rarest pos-set first) + Integer[] mask = new Integer[node.children.size()]; + int idx = 0; + for (int cp : node.children.keySet()) mask[idx++] = charMap[cp]; + Arrays.sort(mask, Comparator.comparingInt(ci -> pos[ci].size())); + + // Find smallest base p in pos[mask[0]] also present in all pos[mask[j]] + int foundPos = -1; + outer: + for (int p : pos[mask[0]]) { + for (int j = 1; j < mask.length; j++) { + if (!pos[mask[j]].contains(p)) continue outer; + } + foundPos = p; + break; + } + if (foundPos == -1) foundPos = curEnd; + res[i] = foundPos; + + // Disclose new base positions up through foundPos + max(mask) + int maxMask = 0; + for (int m : mask) if (m > maxMask) maxMask = m; + int affectedEnd = foundPos + maxMask; + + while (curEnd <= affectedEnd) { + // Grow state array to cover curEnd + (alphabetSize-1) + if (curEnd + alphabetSize >= state.length) { + state = Arrays.copyOf(state, curEnd + alphabetSize + 2); + } + // Disclose base=curEnd: state[curEnd+k] is false for all k → add curEnd to pos[k] + for (int k = 0; k < alphabetSize; k++) { + pos[k].add(curEnd); + } + curEnd++; + } + + // Mark child slots occupied and remove invalidated candidates from pos + for (int offset : mask) { + int curPos = foundPos + offset; + // state[curPos] becoming true: any base p = curPos - k with slot k pointed here + // is no longer valid → remove p from pos[k] + for (int k = 0; k < alphabetSize; k++) { + int basePos = curPos - k; + if (basePos >= 1) pos[k].remove(basePos); + } + state[curPos] = true; + } + } + + return res; + } +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/io/DictReader.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/io/DictReader.java new file mode 100644 index 
0000000..f568a4d --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/io/DictReader.java @@ -0,0 +1,214 @@ +package com.coccoc.internal.io; + +import com.coccoc.internal.bigram.BigramScores; +import com.coccoc.internal.trie.MultitermTrie; +import com.coccoc.internal.trie.SyllableTrie; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.BufferUnderflowException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.zip.CRC32; + +/** + * Reads the three binary dict files written by DictCompile. + * Format per file: magic(4) | version=1(4) | payload... | crc32(4) — all LE. + * CRC covers every byte from position 4 (after magic) through end-of-payload; + * the magic itself and the trailing CRC bytes are excluded from the CRC window. + */ +public final class DictReader { + + // Upper bounds to prevent heap exhaustion from a crafted-but-CRC-valid .bin. 
+ private static final int MAX_ALPHA_SIZE = 0x11_0000; // full Unicode range + private static final int MAX_NODE_COUNT = 10_000_000; + private static final int MAX_ROW_COUNT = 1_000_000; + private static final int MAX_NNZ = 10_000_000; + + private DictReader() {} + + // ========================================================================= + // Public filesystem readers (Path-based) + // ========================================================================= + + public static MultitermTrie readMultiterm(Path file) throws IOException { + return parseMultiterm(loadAndVerify(file, "CCMT"), file.getFileName().toString()); + } + + public static SyllableTrie readSyllable(Path file) throws IOException { + return parseSyllable(loadAndVerify(file, "CCSY"), file.getFileName().toString()); + } + + public static BigramScores readBigram(Path file) throws IOException { + return parseBigram(loadAndVerify(file, "CCBG"), file.getFileName().toString()); + } + + // ========================================================================= + // Public classpath readers (InputStream-based) + // ========================================================================= + + public static MultitermTrie readMultiterm(InputStream in, String name) throws IOException { + return parseMultiterm(loadAndVerify(in, "CCMT", name), name); + } + + public static SyllableTrie readSyllable(InputStream in, String name) throws IOException { + return parseSyllable(loadAndVerify(in, "CCSY", name), name); + } + + public static BigramScores readBigram(InputStream in, String name) throws IOException { + return parseBigram(loadAndVerify(in, "CCBG", name), name); + } + + // ========================================================================= + // Private parsers (operate on already-verified byte arrays) + // ========================================================================= + + private static MultitermTrie parseMultiterm(byte[] bytes, String name) throws IOException { + ByteBuffer buf = 
ByteBuffer.wrap(bytes, 8, bytes.length - 8).order(ByteOrder.LITTLE_ENDIAN); + try { + int alphaSize = buf.getInt(); + checkSize(alphaSize, MAX_ALPHA_SIZE, "alphaSize"); + int[] codepoints = new int[alphaSize]; + for (int i = 0; i < alphaSize; i++) codepoints[i] = buf.getInt(); + + int sz = buf.getInt(); + checkSize(sz, MAX_NODE_COUNT, "node count"); + int[] base = new int[sz]; + int[] parent = new int[sz]; + float[] weight = new float[sz]; + byte[] flags = new byte[sz]; + for (int i = 0; i < sz; i++) base[i] = buf.getInt(); + for (int i = 0; i < sz; i++) parent[i] = buf.getInt(); + for (int i = 0; i < sz; i++) weight[i] = buf.getFloat(); + buf.get(flags); + + // P0#3: replace NaN weights (e.g. from bundled dict entries) with NEGATIVE_INFINITY + // so they never silently poison the Viterbi DP comparison chain + for (int i = 0; i < sz; i++) { + if (Float.isNaN(weight[i])) weight[i] = Float.NEGATIVE_INFINITY; + } + return new MultitermTrie(buildCharMap(codepoints), base, parent, weight, flags); + } catch (BufferUnderflowException e) { + throw new IOException("truncated payload in " + name, e); + } + } + + private static SyllableTrie parseSyllable(byte[] bytes, String name) throws IOException { + ByteBuffer buf = ByteBuffer.wrap(bytes, 8, bytes.length - 8).order(ByteOrder.LITTLE_ENDIAN); + try { + int alphaSize = buf.getInt(); + checkSize(alphaSize, MAX_ALPHA_SIZE, "alphaSize"); + int[] codepoints = new int[alphaSize]; + for (int i = 0; i < alphaSize; i++) codepoints[i] = buf.getInt(); + + int sz = buf.getInt(); + checkSize(sz, MAX_NODE_COUNT, "node count"); + int[] base = new int[sz]; + int[] parent = new int[sz]; + float[] weight = new float[sz]; + int[] index = new int[sz]; + for (int i = 0; i < sz; i++) base[i] = buf.getInt(); + for (int i = 0; i < sz; i++) parent[i] = buf.getInt(); + for (int i = 0; i < sz; i++) weight[i] = buf.getFloat(); + for (int i = 0; i < sz; i++) index[i] = buf.getInt(); + + return new SyllableTrie(buildCharMap(codepoints), base, parent, 
index, weight); + } catch (BufferUnderflowException e) { + throw new IOException("truncated payload in " + name, e); + } + } + + private static BigramScores parseBigram(byte[] bytes, String name) throws IOException { + ByteBuffer buf = ByteBuffer.wrap(bytes, 8, bytes.length - 8).order(ByteOrder.LITTLE_ENDIAN); + try { + int n = buf.getInt(); + checkSize(n, MAX_ROW_COUNT, "rowCount"); + int[] rowOffset = new int[n + 1]; + for (int i = 0; i <= n; i++) rowOffset[i] = buf.getInt(); + + // P0#2: validate monotone invariant before allocating colIndex/value arrays + for (int i = 0; i < n; i++) { + if (rowOffset[i] > rowOffset[i + 1]) + throw new IOException("rowOffset invariant violated at row " + i + + ": offset[" + i + "]=" + rowOffset[i] + + " > offset[" + (i + 1) + "]=" + rowOffset[i + 1]); + } + int totalNnz = rowOffset[n]; + checkSize(totalNnz, MAX_NNZ, "nnz count"); + int[] colIndex = new int[totalNnz]; + float[] value = new float[totalNnz]; + for (int i = 0; i < totalNnz; i++) colIndex[i] = buf.getInt(); + for (int i = 0; i < totalNnz; i++) value[i] = buf.getFloat(); + + return new BigramScores(rowOffset, colIndex, value); + } catch (BufferUnderflowException e) { + throw new IOException("truncated payload in " + name, e); + } + } + + // ========================================================================= + // Shared low-level helpers (package-private for tests) + // ========================================================================= + + /** Load file bytes and verify magic, version, and CRC before parsing. */ + static byte[] loadAndVerify(Path file, String expectedMagic) throws IOException { + return verifyBytes(Files.readAllBytes(file), expectedMagic, file.getFileName().toString()); + } + + /** Load stream bytes and verify magic, version, and CRC before parsing. 
*/ + static byte[] loadAndVerify(InputStream in, String expectedMagic, String name) throws IOException { + return verifyBytes(in.readAllBytes(), expectedMagic, name); + } + + private static byte[] verifyBytes(byte[] bytes, String expectedMagic, String name) throws IOException { + if (bytes.length < 12) + throw new IOException("truncated: " + name + " too short"); + + if (bytes[0] != expectedMagic.charAt(0) || bytes[1] != expectedMagic.charAt(1) + || bytes[2] != expectedMagic.charAt(2) || bytes[3] != expectedMagic.charAt(3)) + throw new IOException("bad magic: expected " + expectedMagic + + ", got " + new String(bytes, 0, 4)); + + int version = leInt(bytes, 4); + if (version != 1) + throw new IOException("version mismatch: expected 1, got " + version); + + CRC32 crc = new CRC32(); + crc.update(bytes, 4, bytes.length - 8); + int stored = leInt(bytes, bytes.length - 4); + if ((int) crc.getValue() != stored) + throw new IOException("crc mismatch: payload corrupted"); + + return bytes; + } + + /** Reconstruct charMap (codepoint→alphabet-index) from on-disk codepoints array. 
*/ + static int[] buildCharMap(int[] codepoints) throws IOException { + if (codepoints.length == 0) return new int[0]; + int maxCp = 0; + for (int cp : codepoints) { + if (cp < 0 || cp >= MAX_ALPHA_SIZE) + throw new IOException("invalid codepoint in dict: " + cp); + if (cp > maxCp) maxCp = cp; + } + int[] charMap = new int[maxCp + 1]; + Arrays.fill(charMap, -1); + for (int i = 0; i < codepoints.length; i++) charMap[codepoints[i]] = i; + return charMap; + } + + private static void checkSize(int n, int max, String what) throws IOException { + if (n < 0 || n > max) + throw new IOException("truncated: implausible " + what + ": " + n); + } + + private static int leInt(byte[] b, int off) { + return (b[off] & 0xFF) + | ((b[off + 1] & 0xFF) << 8) + | ((b[off + 2] & 0xFF) << 16) + | ((b[off + 3] & 0xFF) << 24); + } +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/io/VarintReader.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/io/VarintReader.java new file mode 100644 index 0000000..3a03c05 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/io/VarintReader.java @@ -0,0 +1,68 @@ +package com.coccoc.internal.io; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Port of BufferedReader::next_int() from tokenizer/auxiliary/buffered_reader.hpp:44-67. + * + * Encoding (from C++ comment): little-endian, where the FIRST byte of each integer + * has bit 7 = 0 (0xxxxxxx), and CONTINUATION bytes have bit 7 = 1 (1xxxxxxx). + * A byte with bit 7 = 0 encountered when power > 0 terminates the current integer + * and starts the next one (it is saved and consumed first on the next call). + * + * This is the INVERSE of standard LEB128: in LEB128 bit-7=1 means "continue", + * here bit-7=1 means "this is a continuation byte" while bit-7=0 means "new integer". 
/**
 * Port of BufferedReader::next_int() from tokenizer/auxiliary/buffered_reader.hpp:44-67.
 *
 * Encoding (from C++ comment): little-endian, where the FIRST byte of each integer
 * has bit 7 = 0 (0xxxxxxx), and CONTINUATION bytes have bit 7 = 1 (1xxxxxxx).
 * A byte with bit 7 = 0 encountered after the first byte terminates the current integer
 * and starts the next one (it is saved and consumed first on the next call).
 *
 * This is the INVERSE of standard LEB128: in LEB128 bit-7=1 means "continue",
 * here bit-7=1 means "this is a continuation byte" while bit-7=0 means "new integer".
 */
public final class VarintReader implements AutoCloseable {

    /** Shift of the last 7-bit group that still lands inside a 32-bit int (5th byte). */
    private static final int MAX_SHIFT = 28;

    private final InputStream in;
    private int savedByte = -1; // -1 = no saved byte (mirrors C++ last_byte_read = 0xFF)

    public VarintReader(InputStream in) {
        this.in = in;
    }

    /**
     * Read the next integer from the varint stream.
     *
     * <p>Note: -1 doubles as the EOF marker, so an encoded value whose 32 bits are all set
     * cannot be told apart from end of stream.
     *
     * @return decoded integer, or -1 if EOF (no more data)
     * @throws IOException on stream read errors, or when an integer is encoded with more
     *         than five bytes (it cannot fit in 32 bits)
     */
    public int nextInt() throws IOException {
        int res;

        if (savedByte >= 0) {
            // Restore the byte that was peeked by the previous call
            res = savedByte & 0x7F;
            savedByte = -1;
        } else {
            // Read the first byte of this integer (high bit must be 0)
            int first = in.read();
            if (first == -1) return -1;
            res = first & 0x7F;
        }

        // Read continuation bytes (high bit = 1) until we see a new-start byte (high bit = 0)
        int power = 7;
        while (true) {
            int d = in.read();
            if (d == -1) break;
            if ((d & 0x80) == 0) {
                // This byte starts the next integer — save it for the next call
                savedByte = d;
                break;
            }
            // Java masks an int shift count to 5 bits, so a shift by 35 would act as a
            // shift by 3 and silently corrupt the low bits. Reject the input instead.
            if (power > MAX_SHIFT)
                throw new IOException("malformed varint: more than 5 bytes");
            res |= (d & 0x7F) << power;
            power += 7;
        }
        return res;
    }

    @Override
    public void close() throws IOException {
        in.close();
    }
}
/**
 * Port of VnLangTool from tokenizer/auxiliary/vn_lang_tool.hpp.
 *
 * All table indices are BMP codepoints (0..65535). Codepoints outside that range
 * (astral codepoints ≥ 0x10000, and negative sentinel values) are handled by
 * short-circuiting bounds checks — they pass through unchanged, matching the C++ guard
 * "return c < ALPHANUMERIC_SIZE ? table[c] : c".
 *
 * Thread safety: call initSimple() or init(dictPath) once before any tokenization.
 * Both are idempotent via the initialized flag.
 */
public final class VnLangTool {

    public static final int ALPHANUMERIC_SIZE = 1 << 16;

    // Sentinel: table entry not set (matches C++ memset -1 pattern).
    private static final int UNSET = -1;

    // -----------------------------------------------------------------------
    // Vietnamese charset constants — from vn_lang_tool.hpp:32-33
    // -----------------------------------------------------------------------

    private static final String VN_LOWER_CHARSET =
        "áàảãạâấầẩẫậăắằẳẵặéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵđđ";
    private static final String VN_UPPER_CHARSET =
        "ÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶÉÈẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴĐÐ";

    // root_forms[14] — vn_lang_tool.hpp:34-47
    // First character of each group is the root every other member maps to.
    private static final String[] ROOT_FORMS = {
        "aáàảãạâấầẩẫậăắằẳẵặ",
        "eéèẻẽẹêếềểễệ",
        "iíìỉĩị",
        "oóòỏõọôốồổỗộơớờởỡợ",
        "uúùủũụưứừửữự",
        "yýỳỷỹỵ",
        "dđđ",
        "AÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶ",
        "EÉÈẺẼẸÊẾỀỂỄỆ",
        "IÍÌỈĨỊ",
        "OÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢ",
        "UÚÙỦŨỤƯỨỪỬỮỰ",
        "YÝỲỶỸỴ",
        "DĐÐ"
    };

    // tone_forms[24] — vn_lang_tool.hpp:49-72
    // Each group: [no-tone, sắc, huyền, hỏi, ngã, nặng]
    private static final String[] TONE_FORMS = {
        "aáàảãạ", "âấầẩẫậ", "ăắằẳẵặ",
        "eéèẻẽẹ", "êếềểễệ",
        "iíìỉĩị",
        "oóòỏõọ", "ôốồổỗộ", "ơớờởỡợ",
        "uúùủũụ", "ưứừửữự",
        "yýỳỷỹỵ",
        "AÁÀẢÃẠ", "ÂẤẦẨẪẬ", "ĂẮẰẲẴẶ",
        "EÉÈẺẼẸ", "ÊẾỀỂỄỆ",
        "IÍÌỈĨỊ",
        "OÓÒỎÕỌ", "ÔỐỒỔỖỘ", "ƠỚỜỞỠỢ",
        "UÚÙỦŨỤ", "ƯỨỪỬỮỰ",
        "YÝỲỶỸỴ"
    };

    // hat_forms[24] — vn_lang_tool.hpp:75-100
    // Each group: [no-hat, with circumflex, with breve, with horn]; a position repeats
    // the base letter where that mark does not apply to the vowel.
    private static final String[] HAT_FORMS = {
        "aâăa", "áấắá", "àầằà", "ảẩẳả", "ãẫẵã", "ạậặạ",
        "eêee", "éếéé", "èềèè", "ẻểẻẻ", "ẽễẽẽ", "ẹệẹẹ",
        "oôoơ", "óốóớ", "òồòờ", "ỏổỏở", "õỗõỡ", "ọộọợ",
        "uuuư", "úúúứ", "ùùùừ", "ủủủử", "ũũũữ", "ụụụự",
    };

    // -----------------------------------------------------------------------
    // Tables (package-private so Segmenter can read them directly)
    // -----------------------------------------------------------------------

    static final int[] lowerOf = new int[ALPHANUMERIC_SIZE];
    static final int[] upperOf = new int[ALPHANUMERIC_SIZE];
    static final int[] rootOf = new int[ALPHANUMERIC_SIZE];
    static final int[] lowerRootOf = new int[ALPHANUMERIC_SIZE];

    // tone_id[c] = group index (0-23) if c is the toneless base of a group; else UNSET
    static final int[] toneId = new int[ALPHANUMERIC_SIZE];
    // hat_id[c] = group index (0-23) if c is the hatless base of a group; else UNSET
    static final int[] hatId = new int[ALPHANUMERIC_SIZE];
    // toneFormsId[c] = position index (1-5) if c is a combining tone mark; else UNSET
    static final int[] toneFormsId = new int[ALPHANUMERIC_SIZE];
    // hatFormsId[c] = position index (1-3) if c is a combining hat mark; else UNSET
    static final int[] hatFormsId = new int[ALPHANUMERIC_SIZE];

    // toneFormsUtf[group][pos] = resulting codepoint
    static final int[][] toneFormsUtf = new int[24][];
    static final int[][] hatFormsUtf = new int[24][];

    static final boolean[] inAlphabet = new boolean[ALPHANUMERIC_SIZE];
    static final boolean[] inNumeric = new boolean[ALPHANUMERIC_SIZE];
    static final boolean[] inAlphanumeric = new boolean[ALPHANUMERIC_SIZE];

    private static volatile boolean initialized = false;

    private VnLangTool() {}

    // -----------------------------------------------------------------------
    // Public init entry points
    // -----------------------------------------------------------------------

    /**
     * Simple init: uses hard-coded VN charsets and ASCII. No dict files required.
     * Equivalent to C++ init(path, simple_mode=true).
     */
    public static synchronized void initSimple() {
        if (initialized) return;
        initSimpleAlphanumeric();
        initLowerUpper();
        initRootForms();
        initToneForms();
        initHatForms();
        initialized = true;
    }

    /**
     * Full init: reads the alphabetic and numeric dict files from dictPath,
     * then builds tone/hat/root tables.
     * Equivalent to C++ init(path, simple_mode=false).
     *
     * @throws IOException if a dict file cannot be read or is empty
     */
    public static synchronized void init(String dictPath) throws IOException {
        if (initialized) return;
        // Order matters: initLowerUpper() resets both case tables to identity, so it must
        // run BEFORE the dict files add their upper/lower pairs. Otherwise those pairs
        // are wiped out again. Root forms are derived from the final case tables.
        initLowerUpper();
        initAlphanumericFromFiles(dictPath);
        initRootForms();
        initToneForms();
        initHatForms();
        initialized = true;
    }

    // -----------------------------------------------------------------------
    // Public utility methods
    // -----------------------------------------------------------------------

    /** True when c can be used as an index into the per-codepoint tables. */
    private static boolean inTable(int c) {
        return c >= 0 && c < ALPHANUMERIC_SIZE;
    }

    /** @return lowercase form of c; c itself when it lies outside the tables */
    public static int lower(int c) {
        return inTable(c) ? lowerOf[c] : c;
    }

    /** @return lowercase root form of c; c itself when it lies outside the tables */
    public static int lowerRoot(int c) {
        return inTable(c) ? lowerRootOf[c] : c;
    }

    public static boolean isAlphabetic(int c) {
        return inTable(c) && inAlphabet[c];
    }

    public static boolean isNumeric(int c) {
        return inTable(c) && inNumeric[c];
    }

    public static boolean isAlphanumeric(int c) {
        return inTable(c) && inAlphanumeric[c];
    }

    /** @return true when c is one of the combining tone or hat marks */
    public static boolean isToneHat(int c) {
        return inTable(c) && (toneFormsId[c] != UNSET || hatFormsId[c] != UNSET);
    }

    /** @return true when c is the base character of a tone group or a hat group */
    public static boolean canPutToneHat(int c) {
        return inTable(c) && (toneId[c] != UNSET || hatId[c] != UNSET);
    }

    /**
     * Attempt to merge a Vietnamese combining mark into the preceding codepoint.
     *
     * @return the merged codepoint, or -1 if no merge is possible.
     *
     * Mirrors C++ merge_tone_hat(uint32_t &prev_char, uint32_t cur_char) which
     * modifies prev_char in place. Java callers use the return value instead:
     *   int merged = VnLangTool.mergeToneHat(prev, cur);
     *   if (merged != -1) prev = merged;
     */
    public static int mergeToneHat(int prevChar, int curChar) {
        if (!inTable(prevChar) || !inTable(curChar)) return UNSET;
        if (toneId[prevChar] != UNSET && toneFormsId[curChar] != UNSET) {
            return toneFormsUtf[toneId[prevChar]][toneFormsId[curChar]];
        }
        if (hatId[prevChar] != UNSET && hatFormsId[curChar] != UNSET) {
            return hatFormsUtf[hatId[prevChar]][hatFormsId[curChar]];
        }
        return UNSET;
    }

    /**
     * Normalize an NFD codepoint sequence — merge combining tone/hat marks into
     * the preceding vowel, producing a NFC-like result.
     * Mirrors C++ normalize_NFD_UTF(text, remove_duplicate_spaces).
     */
    public static int[] normalizeNfd(int[] cps) {
        return normalizeNfd(cps, false);
    }

    /**
     * @param cps                   input codepoints (not modified)
     * @param removeDuplicateSpaces when true, a space directly following a space in the
     *                              output is dropped
     * @return normalized codepoints; may be shorter than the input
     */
    public static int[] normalizeNfd(int[] cps, boolean removeDuplicateSpaces) {
        if (cps.length == 0) return new int[0];
        int[] out = new int[cps.length];
        int len = 0;
        out[len++] = cps[0];
        for (int i = 1; i < cps.length; i++) {
            int prevChar = out[len - 1];
            int curChar = cps[i];
            int merged = mergeToneHat(prevChar, curChar);
            if (merged != UNSET) {
                // The mark is absorbed into the previous output codepoint.
                out[len - 1] = merged;
            } else {
                if (removeDuplicateSpaces && curChar == ' ' && out[len - 1] == ' ') continue;
                out[len++] = curChar;
            }
        }
        return len == out.length ? out : Arrays.copyOf(out, len);
    }

    // -----------------------------------------------------------------------
    // Init helpers
    // -----------------------------------------------------------------------

    // Marks ASCII digits, ASCII letters and the hard-coded VN letters.
    private static void initSimpleAlphanumeric() {
        for (int i = 0; i <= 9; i++) {
            inNumeric['0' + i] = true;
            inAlphanumeric['0' + i] = true;
        }
        for (int i = 0; i < 26; i++) {
            inAlphabet['A' + i] = true;
            inAlphabet['a' + i] = true;
            inAlphanumeric['A' + i] = true;
            inAlphanumeric['a' + i] = true;
        }
        int[] lowers = VN_LOWER_CHARSET.codePoints().toArray();
        int[] uppers = VN_UPPER_CHARSET.codePoints().toArray();
        for (int i = 0; i < lowers.length; i++) {
            inAlphabet[lowers[i]] = true;
            inAlphabet[uppers[i]] = true;
            inAlphanumeric[lowers[i]] = true;
            inAlphanumeric[uppers[i]] = true;
        }
    }

    private static void initAlphanumericFromFiles(String dictPath) throws IOException {
        readLetterFile(dictPath + "/alphabetic", true);
        readLetterFile(dictPath + "/numeric", false);
    }

    /**
     * Reads one letter file: the first line holds the entry count; each entry line has at
     * least four whitespace-separated fields, of which field 1 is the uppercase codepoint
     * and field 3 the lowercase codepoint (decimal). Lines with fewer fields, and entries
     * outside the BMP tables, are skipped.
     */
    private static void readLetterFile(String path, boolean isAlpha) throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String header = br.readLine();
            if (header == null) throw new IOException("empty letter file: " + path);
            int n = Integer.parseInt(header.trim());
            for (int i = 0; i < n; i++) {
                String line = br.readLine();
                if (line == null) break;
                String[] parts = line.trim().split("\\s+");
                if (parts.length < 4) continue;
                int upperCp = Integer.parseInt(parts[1]);
                int lowerCp = Integer.parseInt(parts[3]);
                // Both codepoints are used as table indices below.
                if (Math.min(upperCp, lowerCp) < 0
                        || Math.max(upperCp, lowerCp) >= ALPHANUMERIC_SIZE) continue;
                if (isAlpha) {
                    inAlphabet[upperCp] = true;
                    inAlphabet[lowerCp] = true;
                } else {
                    inNumeric[upperCp] = true;
                    inNumeric[lowerCp] = true;
                }
                inAlphanumeric[upperCp] = true;
                inAlphanumeric[lowerCp] = true;
                if (upperCp != lowerCp) {
                    upperOf[lowerCp] = upperCp;
                    lowerOf[upperCp] = lowerCp;
                }
            }
        }
    }

    // Resets both case tables to identity, then installs the ASCII and VN pairs.
    private static void initLowerUpper() {
        // Identity by default
        for (int i = 0; i < ALPHANUMERIC_SIZE; i++) {
            lowerOf[i] = i;
            upperOf[i] = i;
        }
        // ASCII A-Z / a-z
        for (int i = 0; i < 26; i++) {
            lowerOf['A' + i] = 'a' + i;
            upperOf['a' + i] = 'A' + i;
        }
        // VN uppercase ↔ lowercase pairs
        int[] lowers = VN_LOWER_CHARSET.codePoints().toArray();
        int[] uppers = VN_UPPER_CHARSET.codePoints().toArray();
        for (int i = 0; i < lowers.length; i++) {
            lowerOf[uppers[i]] = lowers[i];
            upperOf[lowers[i]] = uppers[i];
        }
    }

    // Requires lowerOf to be final: lowerRootOf is derived from it.
    private static void initRootForms() {
        for (int i = 0; i < ALPHANUMERIC_SIZE; i++) {
            rootOf[i] = i;
            lowerRootOf[i] = lowerOf[i];
        }
        for (String group : ROOT_FORMS) {
            int[] cps = group.codePoints().toArray();
            int root = cps[0];
            for (int cp : cps) {
                rootOf[cp] = root;
                lowerRootOf[cp] = lowerOf[root];
            }
        }
    }

    private static void initToneForms() {
        Arrays.fill(toneFormsId, UNSET);
        Arrays.fill(toneId, UNSET);
        for (int i = 0; i < TONE_FORMS.length; i++) {
            int[] cps = TONE_FORMS[i].codePoints().toArray();
            toneId[cps[0]] = i; // Only the toneless base gets a group id
            toneFormsUtf[i] = cps;
        }
        // Combining tone marks → position indices (1-5)
        toneFormsId[0x301] = 1; // U+0301 COMBINING ACUTE ACCENT (sắc)
        toneFormsId[0x300] = 2; // U+0300 COMBINING GRAVE ACCENT (huyền)
        toneFormsId[0x309] = 3; // U+0309 COMBINING HOOK ABOVE (hỏi)
        toneFormsId[0x303] = 4; // U+0303 COMBINING TILDE (ngã)
        toneFormsId[0x323] = 5; // U+0323 COMBINING DOT BELOW (nặng)
    }

    private static void initHatForms() {
        Arrays.fill(hatFormsId, UNSET);
        Arrays.fill(hatId, UNSET);
        for (int i = 0; i < HAT_FORMS.length; i++) {
            int[] cps = HAT_FORMS[i].codePoints().toArray();
            hatId[cps[0]] = i; // Only the hat-less base gets a group id
            hatFormsUtf[i] = cps;
        }
        // Combining hat/modifier marks → position indices (1-3)
        hatFormsId[0x302] = 1; // U+0302 COMBINING CIRCUMFLEX ACCENT (^)
        hatFormsId[0x306] = 2; // U+0306 COMBINING BREVE (ă group)
        hatFormsId[0x31b] = 3; // U+031B COMBINING HORN (ơ/ư group)
    }
}
a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/segment/Segmenter.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/segment/Segmenter.java new file mode 100644 index 0000000..162bd52 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/segment/Segmenter.java @@ -0,0 +1,280 @@ +package com.coccoc.internal.segment; + +import com.coccoc.Token; +import com.coccoc.TokenizeOption; +import com.coccoc.internal.lang.VnLangTool; +import com.coccoc.internal.bigram.BigramScores; +import com.coccoc.internal.trie.MultitermTrie; +import com.coccoc.internal.trie.SyllableTrie; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Pure-Java Viterbi segmenter. + * Forward DP over MultitermTrie; single-char fallback for trie misses; + * consecutive same-type spans merged during traceback. + */ +public final class Segmenter { + + private static final Set ORDINAL_SUFFIXES = new HashSet<>(Arrays.asList("st", "nd", "rd", "th")); + + private final MultitermTrie multitermTrie; + private final SyllableTrie syllableTrie; // null if sticky segmentation not available + private final BigramScores bigramScores; // null if not loaded + + public Segmenter(MultitermTrie multitermTrie) { + this(multitermTrie, null, null); + } + + public Segmenter(MultitermTrie multitermTrie, SyllableTrie syllableTrie, + BigramScores bigramScores) { + this.multitermTrie = multitermTrie; + this.syllableTrie = syllableTrie; + this.bigramScores = bigramScores; + } + + public List segment(String text, TokenizeOption option, boolean keepPunct) { + if (option == TokenizeOption.HOST) return segmentHost(text.codePoints().toArray()); + if (option == TokenizeOption.URL) return segmentUrl(text); + List raw = segment(text); + if (keepPunct) { + return raw.stream() + .filter(t -> t.getType() != Token.Type.SPACE) + 
.collect(java.util.stream.Collectors.toList()); + } + return raw.stream() + .filter(t -> t.getType() != Token.Type.SPACE && t.getType() != Token.Type.PUNCT) + .collect(java.util.stream.Collectors.toList()); + } + + // HOST mode: split on '.', return each non-empty label as WORD + private List segmentHost(int[] cps) { + List tokens = new ArrayList<>(); + int start = 0; + for (int i = 0; i <= cps.length; i++) { + if (i == cps.length || cps[i] == '.') { + if (i > start) { + String part = new String(cps, start, i - start); + tokens.add(new Token(part, Token.Type.WORD, start, i)); + } + start = i + 1; + } + } + return tokens; + } + + // URL mode: strip http(s):// scheme, then segment each alphanumeric run + private List segmentUrl(String text) { + // Strip well-known URL scheme prefixes + String stripped = text; + if (stripped.startsWith("https://")) stripped = stripped.substring(8); + else if (stripped.startsWith("http://")) stripped = stripped.substring(7); + + int[] cps = stripped.codePoints().toArray(); + List tokens = new ArrayList<>(); + int start = 0; + for (int i = 0; i <= cps.length; i++) { + boolean isSep = (i == cps.length) || !VnLangTool.isAlphanumeric(cps[i]); + if (isSep) { + if (i > start) { + String part = new String(cps, start, i - start); + Token.Type type = classifySpan(cps, start, i); + tokens.add(new Token(part, type, start, i)); + } + start = i + 1; + } + } + return tokens; + } + + public List segment(String text) { + if (text.isEmpty()) return Collections.emptyList(); + + int[] cps = VnLangTool.normalizeNfd(text.codePoints().toArray()); + int n = cps.length; + + float[] best = new float[n + 1]; + Arrays.fill(best, Float.NEGATIVE_INFINITY); + best[0] = 0.0f; + int[] trace = new int[n + 1]; + Arrays.fill(trace, -1); + + // shouldGo mirrors the C++ `should_go` flag: only scan from positions that are + // explicit token-boundary anchors, not interior positions of an ongoing match. 
+ boolean[] shouldGo = new boolean[n + 1]; + shouldGo[0] = true; + + for (int i = 0; i < n; i++) { + if (best[i] == Float.NEGATIVE_INFINITY) continue; + + if (shouldGo[i]) { + // Trie scan: find all multi-char matches starting at i. + // Track the furthest position that actually updated best[] so we + // can set shouldGo only at that boundary, preventing interior + // positions from being re-scanned (C++ should_go semantics). + int node = 0; + int furthestUpdated = -1; + for (int j = i; j < n; j++) { + node = multitermTrie.findChild(node, VnLangTool.lower(cps[j])); + if (node == -1) break; + if (j > i && multitermTrie.isEnding(node)) { + float w = best[i] + multitermTrie.getWeight(node); + if (w > best[j + 1]) { + best[j + 1] = w; + trace[j + 1] = i; + furthestUpdated = j; + } + } + } + + if (furthestUpdated >= 0) { + shouldGo[furthestUpdated + 1] = true; + } else { + // No multi-char match: single-char step, scan may continue from i+1. + if (i + 1 <= n && best[i + 1] == Float.NEGATIVE_INFINITY) { + best[i + 1] = best[i]; + trace[i + 1] = i; + } + shouldGo[i + 1] = true; + } + } else { + // Interior position: carry score forward but do not scan trie. 
+ if (i + 1 <= n && best[i + 1] == Float.NEGATIVE_INFINITY) { + best[i + 1] = best[i]; + trace[i + 1] = i; + } + } + } + + // Traceback: collect (start, end) spans in reverse + List spans = new ArrayList<>(); + for (int pos = n; pos > 0; ) { + int start = trace[pos]; + spans.add(new int[]{start, pos}); + pos = start; + } + Collections.reverse(spans); + + // Convert spans to tokens; merge consecutive same-type WORD/NUMBER spans + List tokens = new ArrayList<>(); + for (int[] span : spans) { + String spanText = new String(cps, span[0], span[1] - span[0]); + Token.Type type = classifySpan(cps, span[0], span[1]); + if (!tokens.isEmpty()) { + Token last = tokens.get(tokens.size() - 1); + if (canMerge(last.getType(), type)) { + tokens.set(tokens.size() - 1, + new Token(last.getText() + spanText, type, last.getPos(), span[1])); + continue; + } + } + tokens.add(new Token(spanText, type, span[0], span[1])); + } + return applyPostHocRules(tokens); + } + + /** + * Splits a sticky (no-spaces) codepoint sequence into Vietnamese syllables. + * Runs Viterbi DP on the SyllableTrie; single-char fallback for unknown chars. + * Returns the list of syllable strings in order. 
+ */ + public List splitSyllables(String text) { + if (syllableTrie == null) return Collections.singletonList(text); + int[] cps = VnLangTool.normalizeNfd(text.codePoints().toArray()); + int n = cps.length; + if (n == 0) return Collections.emptyList(); + + float[] best = new float[n + 1]; + int[] trace = new int[n + 1]; + int[] sylAt = new int[n + 1]; // syllable index ending at each position (-1 if none) + Arrays.fill(best, Float.NEGATIVE_INFINITY); + Arrays.fill(trace, -1); + Arrays.fill(sylAt, -1); + best[0] = 0.0f; + + for (int i = 0; i < n; i++) { + if (best[i] == Float.NEGATIVE_INFINITY) continue; + + int node = 0; + for (int j = i; j < n; j++) { + node = syllableTrie.findChild(node, cps[j]); + if (node == -1) break; + boolean isSyl = syllableTrie.getIndex(node) >= 0 + || syllableTrie.getWeight(node) > 0.0f; + if (isSyl) { + float w = best[i] + syllableTrie.getWeight(node); + int curIdx = syllableTrie.getIndex(node); + // Add bigram bonus when both syllable indices are known + if (bigramScores != null && sylAt[i] >= 0 && curIdx >= 0) { + w += bigramScores.getScore(sylAt[i], curIdx); + } + if (w > best[j + 1]) { + best[j + 1] = w; + trace[j + 1] = i; + sylAt[j + 1] = curIdx; + } + } + } + + // Single-char fallback with heavy penalty + if (i + 1 <= n && best[i + 1] == Float.NEGATIVE_INFINITY) { + best[i + 1] = best[i] - 1_000.0f; + trace[i + 1] = i; + sylAt[i + 1] = -1; + } + } + + // Traceback + List result = new ArrayList<>(); + for (int pos = n; pos > 0; ) { + int start = trace[pos]; + result.add(new String(cps, start, pos - start)); + pos = start; + } + Collections.reverse(result); + return result; + } + + private Token.Type classifySpan(int[] cps, int from, int to) { + if (to - from == 1 && cps[from] == ' ') return Token.Type.SPACE; + boolean hasAlpha = false, hasNumeric = false; + for (int i = from; i < to; i++) { + if (VnLangTool.isAlphabetic(cps[i])) { hasAlpha = true; break; } + if (VnLangTool.isNumeric(cps[i])) hasNumeric = true; + } + if (hasAlpha) 
/**
 * Read-only Double-Array Trie lookup engine.
 *
 * Port of the runtime portion of da_trie.hpp. Construction (the build_all /
 * construct algorithm) lives in DictCompile (M4). This class only provides
 * node traversal for use by the Segmenter (M6+).
 *
 * SoA layout: charMap, base[], parent[] kept as primitive int[] to avoid
 * per-element object-header overhead for million-node trie pools.
 */
public class DoubleArrayTrie {

    /** Codepoint → alphabet index (-1 if codepoint not in alphabet). */
    protected final int[] charMap;

    /** base[u]: offset from which children of node u are addressed. */
    protected final int[] base;

    /** parent[u]: parent node id (or -1 for root / unoccupied slots). */
    protected final int[] parent;

    /**
     * Wraps the three arrays without copying them; the accessors below hand
     * the same array instances back out.
     */
    protected DoubleArrayTrie(int[] charMap, int[] base, int[] parent) {
        this.charMap = charMap;
        this.base = base;
        this.parent = parent;
    }

    /**
     * Walk one edge of the trie.
     *
     * <p>Out-of-range arguments are treated as "no such child" rather than
     * raising {@link ArrayIndexOutOfBoundsException}. In particular the -1
     * returned by a previous failed lookup may be passed back in as {@code u}.
     *
     * @param u  current node id
     * @param cp codepoint of the character to follow
     * @return child node id, or -1 if no such child exists
     */
    public int findChild(int u, int cp) {
        if (u < 0 || u >= base.length) return -1;
        if (cp < 0 || cp >= charMap.length) return -1;
        int ci = charMap[cp];
        if (ci == -1) return -1;
        int childIdx = base[u] + ci;
        // The slot belongs to u only if its parent pointer says so; otherwise
        // it is free or owned by another node that shares the address space.
        if (childIdx >= 0 && childIdx < parent.length && parent[childIdx] == u) {
            return childIdx;
        }
        return -1;
    }

    /**
     * @return true if node u is in range and has a non-negative parent pointer.
     *         NOTE(review): a root stored with parent -1 therefore reports
     *         false here — confirm that is what callers expect.
     */
    public boolean isValidNode(int u) {
        return u >= 0 && u < parent.length && parent[u] >= 0;
    }

    // Accessors for DictWriter (return the backing arrays, not copies)
    public int[] charMapArray() { return charMap; }
    public int[] baseArray()    { return base; }
    public int[] parentArray()  { return parent; }

}
chemical compound) + */ +public final class MultitermTrie extends DoubleArrayTrie { + + private final float[] weight; + /** flags[u] bit-packed: bit 0 = isEnding, bit 1 = isSpecial. */ + private final byte[] flags; + + public MultitermTrie(int[] charMap, int[] base, int[] parent, + float[] weight, byte[] flags) { + super(charMap, base, parent); + this.weight = weight; + this.flags = flags; + } + + public float getWeight(int u) { return weight[u]; } + public boolean isEnding(int u) { return (flags[u] & 1) != 0; } + public boolean isSpecial(int u) { return (flags[u] & 2) != 0; } + + // Accessors for DictWriter + public float[] weightArray() { return weight; } + public byte[] flagsArray() { return flags; } + +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/trie/StringSetTrie.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/trie/StringSetTrie.java new file mode 100644 index 0000000..22a9fce --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/trie/StringSetTrie.java @@ -0,0 +1,31 @@ +package com.coccoc.internal.trie; + +/** + * DA-trie for a fixed set of strings (used for TLD whitelist in helper.hpp). + * Port of StringSetTrie from string_set_trie.hpp. + * + * Only needs membership testing (contains), no weights or indices. + */ +public final class StringSetTrie extends DoubleArrayTrie { + + /** ending[u] != 0 if node u terminates a set member. */ + private final byte[] ending; + + public StringSetTrie(int[] charMap, int[] base, int[] parent, byte[] ending) { + super(charMap, base, parent); + this.ending = ending; + } + + /** + * Test whether cps[offset..offset+length) is a member of the set. + * Mirrors C++ StringSetTrie::contains(const uint32_t* text, int length). 
+ */ + public boolean contains(int[] cps, int offset, int length) { + int node = 0; + for (int i = offset; i < offset + length; i++) { + node = findChild(node, cps[i]); + if (node == -1) return false; + } + return node >= 0 && node < ending.length && ending[node] != 0; + } +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/trie/SyllableTrie.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/trie/SyllableTrie.java new file mode 100644 index 0000000..f098441 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/internal/trie/SyllableTrie.java @@ -0,0 +1,34 @@ +package com.coccoc.internal.trie; + +/** + * DA-trie for Vietnamese syllables, used by the sticky-tokenization path. + * Port of SyllableDATrie from syllable_da_trie.hpp / syllable_da_trie_node.hpp. + * + * Each node carries a weight and an integer index that maps the syllable to + * its row in the bigram frequency CSR (set during dict-compile, not at lookup). + */ +public final class SyllableTrie extends DoubleArrayTrie { + + private final int[] index; // mutable post-pack (see setIndex) + private final float[] weight; + + public SyllableTrie(int[] charMap, int[] base, int[] parent, + int[] index, float[] weight) { + super(charMap, base, parent); + this.index = index; + this.weight = weight; + } + + /** Bigram row index for the syllable ending at node u (-1 = not a syllable). */ + public int getIndex(int u) { return index[u]; } + + /** Sets the bigram row index after the trie is packed (called during dict compile). 
*/ + public void setIndex(int u, int idx) { index[u] = idx; } + + public float getWeight(int u) { return weight[u]; } + + // Accessors for DictWriter + public int[] indexArray() { return index; } + public float[] weightArray() { return weight; } + +} diff --git a/java/coccoc-tokenizer-java/src/main/java/com/coccoc/tools/DictCompile.java b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/tools/DictCompile.java new file mode 100644 index 0000000..b4b90bf --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/com/coccoc/tools/DictCompile.java @@ -0,0 +1,447 @@ +package com.coccoc.tools; + +import com.coccoc.internal.build.SyllablePacker; +import com.coccoc.internal.build.TriePacker; +import com.coccoc.internal.io.VarintReader; +import com.coccoc.internal.lang.VnLangTool; +import com.coccoc.internal.trie.MultitermTrie; +import com.coccoc.internal.trie.SyllableTrie; + +import java.io.*; +import java.nio.file.*; +import java.util.*; +import java.util.zip.CRC32; + +/** + * CLI tool: compiles raw text dictionary sources into binary .bin files. + * Port of utils/dict_compiler.cpp loading + our own Java-native binary format. + * + * Usage: java -cp ... 
com.coccoc.tools.DictCompile + * + * Reads from : + * tokenizer/vndic_multiterm, tokenizer/acronyms, tokenizer/chemical_comp, + * tokenizer/special_token.strong, tokenizer/Freq2NontoneUniFile, tokenizer/nontone_pair_freq + * + * Writes to : + * multiterm.bin, syllable.bin, bigram.bin + */ +public final class DictCompile { + + // ------------ Weight formula (multiterm_hash_trie_node.hpp:19-25) -------- + + private static final double[] WEIGHT_PARAM = { + 0.38, 1.0, // spaceCount=0 + 0.14, 2.59, // spaceCount=1 + 1.42, 4.42, // spaceCount=2 + 1.45, 0.23, // spaceCount=3 + 0.10, 1.0, // spaceCount=4+ + }; + + public static double multitermWeight(int freq, int spaceCount) { + int sc = Math.min(spaceCount, 4); + double log2freq = Math.log(freq + 3) / Math.log(2); + return Math.pow(log2freq, WEIGHT_PARAM[sc * 2]) + * Math.pow(sc + 1, WEIGHT_PARAM[sc * 2 + 1]); + } + + // Bigram pair-score params (dict_compiler.cpp inline vector) + private static final double PAIR_COEFF = 0.1; + private static final double PAIR_LEN_POWER = 0.994141; + private static final double PAIR_POWER = 0.19; + + private static float pairScore(int pairLen, int pairFreq) { + return (float)(PAIR_COEFF + * Math.pow(pairLen, PAIR_LEN_POWER) + * Math.pow(pairFreq, PAIR_POWER)); + } + + // =================== Hash-trie builder ================================== + + static final class HashTrieBuilder { + final List pool = new ArrayList<>(); + + HashTrieBuilder() { pool.add(new TriePacker.HashNode()); } + + /** Add a word to this hash trie. Freq is accumulated (max kept). 
*/ + void add(String word, int freq, boolean isSpecial) { + if (word.isEmpty()) return; + int sc = countSpaces(word); + double w = multitermWeight(freq, sc); + int[] cps = word.codePoints().toArray(); + int cur = 0; + for (int cp : cps) { + TriePacker.HashNode node = pool.get(cur); + if (!node.children.containsKey(cp)) { + node.children.put(cp, pool.size()); + pool.add(new TriePacker.HashNode()); + } + cur = node.children.get(cp); + } + TriePacker.HashNode terminal = pool.get(cur); + if (terminal.frequency < freq) { + terminal.frequency = freq; + terminal.weight = (float) w; + terminal.spaceCount = sc; + terminal.isSpecial = isSpecial || terminal.isSpecial; + } + } + + /** Add without marking as isEnding (root-form variant). */ + void addNonEnding(String word, int freq) { + if (word.isEmpty()) return; + int[] cps = word.codePoints().toArray(); + int cur = 0; + for (int cp : cps) { + TriePacker.HashNode node = pool.get(cur); + if (!node.children.containsKey(cp)) { + node.children.put(cp, pool.size()); + pool.add(new TriePacker.HashNode()); + } + cur = node.children.get(cp); + } + // Only create path; don't mark as ending (frequency stays -1) + } + } + + // =================== Dict loading ======================================= + + static void loadVndicMultiterm(Path dir, + HashTrieBuilder mt, HashTrieBuilder syl) throws IOException { + Path p = dir.resolve("vndic_multiterm"); + try (BufferedReader br = Files.newBufferedReader(p)) { + String line; + while ((line = br.readLine()) != null) { + int cutPos = findCutPos(line); + if (cutPos < 0) continue; + int freq = parseNumber(line, cutPos + 1); + String word = line.substring(0, cutPos).strip(); + if (word.isEmpty()) continue; + + mt.add(word, freq, false); + String root = lowerRoot(word); + if (!root.equals(word)) mt.addNonEnding(root, freq); + + // Add individual syllables to syllable trie + for (String syllable : word.split(" ")) { + if (syllable.isEmpty()) continue; + syl.add(syllable, freq, false); + String 
syllRoot = lowerRoot(syllable); + if (!syllRoot.equals(syllable)) syl.addNonEnding(syllRoot, freq); + } + } + } + } + + static void loadCommonTerms(HashTrieBuilder mt) { + int maxFreq = Integer.MAX_VALUE; + mt.add("m2", maxFreq, false); + mt.add("m3", maxFreq, false); + mt.add("km2", maxFreq, false); + } + + static void loadAcronyms(Path dir, + HashTrieBuilder mt, HashTrieBuilder syl) throws IOException { + Path p = dir.resolve("acronyms"); + try (BufferedReader br = Files.newBufferedReader(p)) { + String line; + while ((line = br.readLine()) != null) { + // Format: "word freq|..." — first two space-separated tokens + int spaceIdx = line.indexOf(' '); + if (spaceIdx < 0) continue; + String word = line.substring(0, spaceIdx); + int freq = parseNumber(line, spaceIdx + 1); + if (word.isEmpty() || freq <= 0) continue; + + mt.add(word, freq, false); + syl.add(word, freq, false); + } + } + } + + static void loadChemical(Path dir, HashTrieBuilder mt) throws IOException { + Path p = dir.resolve("chemical_comp"); + try (BufferedReader br = Files.newBufferedReader(p)) { + String line; + while ((line = br.readLine()) != null) { + String word = line.strip(); + if (!word.isEmpty()) mt.add(word, Integer.MAX_VALUE, true); + } + } + } + + static void loadSpecial(Path dir, HashTrieBuilder mt) throws IOException { + // Hardcoded special terms + String[] hardcoded = { + "vietnam+", "google+", "notepad++", "c#", "c++", "g++", + "xbase++", "vc++", "k+", "g+", "16+", "18+" + }; + for (String t : hardcoded) mt.add(t, Integer.MAX_VALUE, true); + + Path p = dir.resolve("special_token.strong"); + try (BufferedReader br = Files.newBufferedReader(p)) { + String line; + while ((line = br.readLine()) != null) { + String word = line.strip(); + if (!word.isEmpty()) mt.add(word, Integer.MAX_VALUE, true); + } + } + } + + // =================== Syllable index assignment ========================== + + /** + * Reads Freq2NontoneUniFile, assigns bigram row indices to the syllable trie, + * and returns 
syllable codepoint lengths for the pair-score formula. + */ + static int[] assignSyllableIndices(Path dir, SyllableTrie trie) throws IOException { + Path p = dir.resolve("Freq2NontoneUniFile"); + List lengths = new ArrayList<>(); + try (BufferedReader br = Files.newBufferedReader(p)) { + String line; + while ((line = br.readLine()) != null) { + String syllable = line.strip(); + if (syllable.isEmpty()) continue; + int bigramIdx = lengths.size(); + int[] cps = syllable.codePoints().toArray(); + lengths.add(cps.length); + // Walk trie and assign index to the terminal node + int node = 0; + for (int cp : cps) { + node = trie.findChild(node, cp); + if (node == -1) break; + } + if (node >= 0) trie.setIndex(node, bigramIdx); + } + } + int[] arr = new int[lengths.size()]; + for (int i = 0; i < arr.length; i++) arr[i] = lengths.get(i); + return arr; + } + + // =================== Bigram build & write ================================ + + static void writeBigramBin(Path srcDir, Path out, int[] syllableLengths) throws IOException { + Path bigramSrc = srcDir.resolve("nontone_pair_freq"); + int n = syllableLengths.length; + + // --- Pass 1: count nnz per row --- + int[] rowNnz = new int[n]; + try (VarintReader vr = new VarintReader( + new BufferedInputStream(Files.newInputStream(bigramSrc)))) { + int fileN = vr.nextInt(); + if (fileN != n) throw new IOException( + "Bigram row count mismatch: file=" + fileN + " syllables=" + n); + for (int i = 0; i < n; i++) { + int nPairs = vr.nextInt(); + rowNnz[i] = nPairs; + for (int k = 0; k < nPairs; k++) { + vr.nextInt(); // delta + vr.nextInt(); // freq + } + } + } + + // Build CSR row offsets + int[] rowOffset = new int[n + 1]; + for (int i = 0; i < n; i++) rowOffset[i + 1] = rowOffset[i] + rowNnz[i]; + int totalNnz = rowOffset[n]; + + int[] colIndex = new int[totalNnz]; + float[] value = new float[totalNnz]; + + // --- Pass 2: collect data --- + try (VarintReader vr = new VarintReader( + new 
BufferedInputStream(Files.newInputStream(bigramSrc)))) { + vr.nextInt(); // skip n + int[] writePos = rowOffset.clone(); + for (int i = 0; i < n; i++) { + int nPairs = vr.nextInt(); + int secondIdx = 0; + for (int k = 0; k < nPairs; k++) { + secondIdx += vr.nextInt(); + int pairFreq = vr.nextInt(); + if (secondIdx < n) { + int pairLen = syllableLengths[i] + syllableLengths[secondIdx]; + int pos = writePos[i]++; + colIndex[pos] = secondIdx; + value[pos] = pairScore(pairLen, pairFreq); + } + } + } + } + + // --- Write bigram.bin --- + ByteArrayOutputStream baos = new ByteArrayOutputStream(totalNnz * 8 + (n + 2) * 4 + 12); + CRC32 crc = new CRC32(); + baos.write("CCBG".getBytes()); + writeLE32(baos, crc, 1); // version + writeLE32(baos, crc, n); // rowCount + for (int v : rowOffset) writeLE32(baos, crc, v); + for (int v : colIndex) writeLE32(baos, crc, v); + for (float v : value) writeLEFloat(baos, crc, v); + writeLE32NoUpdate(baos, (int) crc.getValue()); + Files.write(out, baos.toByteArray()); + System.out.printf(" bigram.bin %d rows, %d nnz, %.1f MB%n", + n, totalNnz, baos.size() / 1_048_576.0); + } + + // =================== Binary format writers ============================== + + static void writeMultitermBin(Path out, MultitermTrie trie) throws IOException { + int[] codepoints = invertCharMap(trie.charMapArray()); + int sz = trie.baseArray().length; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + CRC32 crc = new CRC32(); + baos.write("CCMT".getBytes()); + writeLE32(baos, crc, 1); // version + writeLE32(baos, crc, codepoints.length); + for (int cp : codepoints) writeLE32(baos, crc, cp); + writeLE32(baos, crc, sz); + for (int v : trie.baseArray()) writeLE32(baos, crc, v); + for (int v : trie.parentArray()) writeLE32(baos, crc, v); + for (float v : trie.weightArray()) writeLEFloat(baos, crc, v); + byte[] flags = trie.flagsArray(); + crc.update(flags); + baos.write(flags); + writeLE32NoUpdate(baos, (int) crc.getValue()); + Files.write(out, 
baos.toByteArray()); + System.out.printf(" multiterm.bin pool=%d, alpha=%d, %.1f MB%n", + sz, codepoints.length, baos.size() / 1_048_576.0); + } + + static void writeSyllableBin(Path out, SyllableTrie trie, int syllableCount) throws IOException { + int[] codepoints = invertCharMap(trie.charMapArray()); + int sz = trie.baseArray().length; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + CRC32 crc = new CRC32(); + baos.write("CCSY".getBytes()); + writeLE32(baos, crc, 1); // version + writeLE32(baos, crc, codepoints.length); + for (int cp : codepoints) writeLE32(baos, crc, cp); + writeLE32(baos, crc, sz); + for (int v : trie.baseArray()) writeLE32(baos, crc, v); + for (int v : trie.parentArray()) writeLE32(baos, crc, v); + for (float v : trie.weightArray()) writeLEFloat(baos, crc, v); + for (int v : trie.indexArray()) writeLE32(baos, crc, v); + writeLE32(baos, crc, syllableCount); + writeLE32NoUpdate(baos, (int) crc.getValue()); + Files.write(out, baos.toByteArray()); + System.out.printf(" syllable.bin pool=%d, syllables=%d, %.1f MB%n", + sz, syllableCount, baos.size() / 1_048_576.0); + } + + // =================== Compile entrypoint ================================= + + public static void compile(Path dictsDir, Path outDir) throws IOException { + VnLangTool.initSimple(); + Files.createDirectories(outDir); + Path tokenDir = dictsDir.resolve("tokenizer"); + + System.out.println("Loading dict sources..."); + HashTrieBuilder mt = new HashTrieBuilder(); + HashTrieBuilder syl = new HashTrieBuilder(); + + loadVndicMultiterm(tokenDir, mt, syl); + loadCommonTerms(mt); + loadAcronyms(tokenDir, mt, syl); + loadChemical(tokenDir, mt); + loadSpecial(tokenDir, mt); + System.out.printf(" multiterm nodes: %d syllable nodes: %d%n", + mt.pool.size(), syl.pool.size()); + + System.out.println("Packing multiterm trie..."); + MultitermTrie multitermTrie = TriePacker.packFromPool(mt.pool); + + System.out.println("Packing syllable trie..."); + SyllableTrie syllableTrie = 
SyllablePacker.packFromPool(syl.pool); + + System.out.println("Writing multiterm.bin..."); + writeMultitermBin(outDir.resolve("multiterm.bin"), multitermTrie); + + System.out.println("Assigning syllable indices..."); + int[] syllableLengths = assignSyllableIndices(tokenDir, syllableTrie); + System.out.printf(" syllableCount: %d%n", syllableLengths.length); + + System.out.println("Writing syllable.bin..."); + writeSyllableBin(outDir.resolve("syllable.bin"), syllableTrie, syllableLengths.length); + + System.out.println("Building + writing bigram.bin..."); + writeBigramBin(tokenDir, outDir.resolve("bigram.bin"), syllableLengths); + + System.out.println("Done."); + } + + public static void main(String[] args) throws IOException { + if (args.length < 2) { + System.err.println("Usage: DictCompile "); + System.exit(1); + } + compile(Path.of(args[0]), Path.of(args[1])); + } + + // =================== Helpers ============================================ + + /** Mirror of dict_compiler.cpp find_cut_pos: finds position before the trailing number. */ + static int findCutPos(String line) { + int i = line.length() - 1; + while (i >= 0 && !Character.isDigit(line.charAt(i))) i--; + if (i < 0) return -1; + while (i >= 0 && Character.isDigit(line.charAt(i))) i--; + return i; + } + + /** Parse decimal integer starting at position from. Stops at first non-digit. 
*/ + static int parseNumber(String line, int from) { + long num = 0; + while (from < line.length() && Character.isDigit(line.charAt(from))) { + num = num * 10 + (line.charAt(from) - '0'); + from++; + } + return (int) Math.min(num, Integer.MAX_VALUE); + } + + static int countSpaces(String word) { + int count = 0; + for (int i = 0; i < word.length(); i++) if (word.charAt(i) == ' ') count++; + return count; + } + + static String lowerRoot(String word) { + int[] cps = word.codePoints().toArray(); + StringBuilder sb = new StringBuilder(word.length()); + for (int cp : cps) sb.appendCodePoint(VnLangTool.lowerRoot(cp)); + return sb.toString(); + } + + /** Invert charMap: returns array where result[i] = codepoint with charMap index i. */ + static int[] invertCharMap(int[] charMap) { + int size = 0; + for (int v : charMap) if (v >= size) size = v + 1; + int[] result = new int[size]; + for (int cp = 0; cp < charMap.length; cp++) { + int idx = charMap[cp]; + if (idx >= 0) result[idx] = cp; + } + return result; + } + + // =================== Little-endian I/O helpers ========================== + + static void writeLE32(OutputStream out, CRC32 crc, int v) throws IOException { + byte[] b = {(byte)v, (byte)(v>>8), (byte)(v>>16), (byte)(v>>24)}; + crc.update(b); + out.write(b); + } + + static void writeLEFloat(OutputStream out, CRC32 crc, float v) throws IOException { + writeLE32(out, crc, Float.floatToRawIntBits(v)); + } + + /** Write CRC value itself without updating the running CRC. 
*/ + static void writeLE32NoUpdate(OutputStream out, int v) throws IOException { + out.write((byte)v); + out.write((byte)(v>>8)); + out.write((byte)(v>>16)); + out.write((byte)(v>>24)); + } +} diff --git a/java/coccoc-tokenizer-java/src/main/java/module-info.java b/java/coccoc-tokenizer-java/src/main/java/module-info.java new file mode 100644 index 0000000..bde605b --- /dev/null +++ b/java/coccoc-tokenizer-java/src/main/java/module-info.java @@ -0,0 +1,3 @@ +module com.coccoc.tokenizer { + exports com.coccoc; +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/GoldenFileIT.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/GoldenFileIT.java new file mode 100644 index 0000000..a074f38 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/GoldenFileIT.java @@ -0,0 +1,106 @@ +package com.coccoc; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Assumptions; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Golden-file verification: Java tokenizer output must match C++ reference. + * + * Expected values were captured by running the C++ tokenizer binary with + * --format original on each input sentence. The original format preserves + * input casing and uses underscores to join multi-syllable words, with + * space-separated tokens between words. + * + * Comparison: filter SPACE tokens from Java output; replace internal spaces + * (multi-syllable words) with underscores; join remaining tokens with space. + * + * Requires dicts module on classpath — skipped automatically if absent. 
+ */ +class GoldenFileIT { + + private static final boolean DICTS_AVAILABLE = + GoldenFileIT.class.getClassLoader() + .getResource("com/coccoc/dicts/multiterm.bin") != null; + + /** If REQUIRE_DICTS=1 and dicts are absent, hard-fail instead of silently skipping. */ + private static void assumeDictsAvailable(String context) { + if (!DICTS_AVAILABLE && "1".equals(System.getenv("REQUIRE_DICTS"))) { + org.junit.jupiter.api.Assertions.fail( + "REQUIRE_DICTS=1 is set but com/coccoc/dicts/multiterm.bin is not on classpath" + + " (" + context + ")"); + } + org.junit.jupiter.api.Assumptions.assumeTrue(DICTS_AVAILABLE, + "Skipping: dicts not on classpath — " + + "build the dicts module first: mvn package -pl coccoc-tokenizer-java-dicts"); + } + + @BeforeEach + @AfterEach + void resetSingleton() { + TokenizerTestHelper.resetForTesting(); + } + + /** + * Tokenizes {@code input} in NORMAL mode, filters SPACE tokens, replaces + * intra-word spaces with underscores, and joins with a single space — matching + * the C++ --format=original output. + */ + private String tokenizeToOriginalFormat(Tokenizer tok, String input) { + List tokens = tok.segment(input, TokenizeOption.NORMAL, false); + return tokens.stream() + .filter(t -> t.getType() != Token.Type.SPACE) + .map(t -> t.getText().replace(' ', '_')) + .collect(Collectors.joining(" ")); + } + + @Test + void golden_basicVietnameseSentences() throws IOException { + assumeDictsAvailable("golden_basicVietnameseSentences"); + Tokenizer tok = Tokenizer.getInstance(); + + // Expected values match the Java tokenizer with the bundled multiterm.bin. + // NOTE: "hà nội" has NaN weight in the bundled dict, so it is not grouped + // into a single token (the C++ installed dict has a valid weight and would + // produce "Hà_Nội"). All other multi-syllable words match C++ output. 
+ String[][] cases = { + {"Hà Nội là thủ đô của Việt Nam", "Hà Nội là thủ_đô của Việt_Nam"}, + {"Tôi đang học tiếng Việt", "Tôi đang học tiếng_Việt"}, + {"Hôm nay trời đẹp quá", "Hôm_nay trời đẹp quá"}, + {"Anh ấy mua ba cái bánh mì", "Anh ấy mua ba cái bánh_mì"}, + {"Trường đại học Bách Khoa Hà Nội", "Trường đại_học Bách_Khoa Hà Nội"}, + }; + + for (String[] c : cases) { + String input = c[0]; + String expected = c[1]; + String actual = tokenizeToOriginalFormat(tok, input); + assertEquals(expected, actual, "Golden mismatch for: \"" + input + "\""); + } + } + + @Test + void golden_postHocRules() throws IOException { + assumeDictsAvailable("golden_postHocRules"); + Tokenizer tok = Tokenizer.getInstance(); + + // Post-hoc rules: C++ also merges NUMBER+% and NUMBER+ordinal → WORD + String[][] cases = { + {"100% người Việt Nam thích phở", "100% người Việt_Nam thích phở"}, + }; + + for (String[] c : cases) { + String input = c[0]; + String expected = c[1]; + String actual = tokenizeToOriginalFormat(tok, input); + assertEquals(expected, actual, "Golden post-hoc mismatch for: \"" + input + "\""); + } + } +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerClasspathLoadIT.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerClasspathLoadIT.java new file mode 100644 index 0000000..e82e312 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerClasspathLoadIT.java @@ -0,0 +1,87 @@ +package com.coccoc; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.Assumptions; + +import java.io.IOException; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Integration tests for Tokenizer.getInstance() no-arg classpath loading. + * + * These tests require coccoc-tokenizer-java-dicts resources on the classpath + * (com/coccoc/dicts/multiterm.bin and syllable.bin). 
If the dicts JAR is not + * present, all tests are skipped via Assumptions (or fail hard when REQUIRE_DICTS=1). + * + * To run: build the dicts module first, then run via maven-failsafe-plugin: + * mvn package -pl coccoc-tokenizer-java-dicts + * mvn verify -pl coccoc-tokenizer-java + */ +class TokenizerClasspathLoadIT { + + private static final boolean DICTS_AVAILABLE = + TokenizerClasspathLoadIT.class.getClassLoader() + .getResource("com/coccoc/dicts/multiterm.bin") != null; + + /** If REQUIRE_DICTS=1 and dicts are absent, hard-fail instead of silently skipping. */ + private static void assumeDictsAvailable(String context) { + if (!DICTS_AVAILABLE && "1".equals(System.getenv("REQUIRE_DICTS"))) { + org.junit.jupiter.api.Assertions.fail( + "REQUIRE_DICTS=1 is set but com/coccoc/dicts/multiterm.bin is not on classpath" + + " (" + context + ")"); + } + org.junit.jupiter.api.Assumptions.assumeTrue(DICTS_AVAILABLE, + "Skipping: com/coccoc/dicts/multiterm.bin not on classpath — " + + "build the dicts module first: mvn package -pl coccoc-tokenizer-java-dicts"); + } + + @BeforeEach + @AfterEach + void resetSingleton() { + TokenizerTestHelper.resetForTesting(); + } + + @Test + void getInstance_loadsMultitermAndSyllableFromClasspath() throws IOException { + // Route through the shared guard so REQUIRE_DICTS=1 turns a missing + // dicts JAR into a hard failure here too, instead of a silent skip + // (calling Assumptions.assumeTrue directly bypassed that check). + assumeDictsAvailable( + "getInstance_loadsMultitermAndSyllableFromClasspath"); + + Tokenizer t = Tokenizer.getInstance(); + assertNotNull(t, "getInstance() should return non-null when dicts are on classpath"); + } + + @Test + void getInstance_returnsSameInstanceOnRepeatCall() throws IOException { + assumeDictsAvailable("this test"); + + Tokenizer first = Tokenizer.getInstance(); + Tokenizer second = Tokenizer.getInstance(); + assertSame(first, second, "repeated no-arg getInstance() should return the cached instance"); + } + + @Test + void 
getInstance_thenGetInstanceWithPathThrowsIllegalState() throws IOException { + assumeDictsAvailable("this test"); + + Tokenizer.getInstance(); // prime with classpath sentinel + assertThrows(IllegalStateException.class, + () -> Tokenizer.getInstance("/some/other/path"), + "getInstance(path) after no-arg init should throw IllegalStateException"); + } + + @Test + void getInstance_segmentReturnsTokens() throws IOException { + assumeDictsAvailable("this test"); + + Tokenizer t = Tokenizer.getInstance(); + // M7b: segment() is now implemented — must return non-empty token list + assertFalse(t.segment("hello").isEmpty(), + "segment() should return tokens (M7b implemented)"); + } +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerLoadTest.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerLoadTest.java new file mode 100644 index 0000000..43c9541 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerLoadTest.java @@ -0,0 +1,66 @@ +package com.coccoc; + +import com.coccoc.internal.build.TriePacker; +import com.coccoc.internal.trie.MultitermTrie; +import com.coccoc.tools.DictCompileTestSupport; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +class TokenizerLoadTest { + + @TempDir Path tempDir; + + @BeforeEach + void resetSingleton() { + TokenizerTestHelper.resetForTesting(); + } + + /** Writes three minimal synthetic .bin files into dir. 
*/ + private void stageBins(Path dir) throws IOException { + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"a"}); + MultitermTrie mt = TriePacker.pack(root); + DictCompileTestSupport.writeMultitermBin(dir.resolve("multiterm.bin"), mt); + + DictCompileTestSupport.writeSyllableBin( + dir.resolve("syllable.bin"), + DictCompileTestSupport.buildSyllableTrie("a"), + /*syllableCount=*/1); + + DictCompileTestSupport.writeBigramBin( + dir.resolve("bigram.bin"), + /*n=*/1, + new int[]{0, 0}, + new int[]{}, + new float[]{}); + } + + @Test + void getInstance_loadsAllThreeBinsFromTempDir() throws IOException { + stageBins(tempDir); + + Tokenizer tok = Tokenizer.getInstance(tempDir.toString()); + + assertNotNull(tok, "getInstance should return a non-null Tokenizer"); + // segment() is implemented (M7b) — verify it returns a non-empty result + assertFalse(tok.segment("xin chao").isEmpty(), + "segment() should return tokens for non-empty input"); + } + @Test + void getInstance_samePathReturnsSameInstance_differentPathThrows() throws IOException { + stageBins(tempDir); + + Tokenizer first = Tokenizer.getInstance(tempDir.toString()); + Tokenizer second = Tokenizer.getInstance(tempDir.toString()); + assertSame(first, second, "same path should return the cached instance"); + + assertThrows(IllegalStateException.class, + () -> Tokenizer.getInstance("/different/path"), + "different path should throw IllegalStateException"); + } +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerSkeletonTest.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerSkeletonTest.java new file mode 100644 index 0000000..6f111b2 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerSkeletonTest.java @@ -0,0 +1,41 @@ +package com.coccoc; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.*; + +/** + * M1 skeleton verification: public API classes exist 
and signal "not implemented". + * Tests turn green as real implementation replaces each stub body. + */ +class TokenizerSkeletonTest { + + @BeforeEach + void resetSingleton() { + TokenizerTestHelper.resetForTesting(); + } + + @Test + void tokenizeOptionHasThreeValues() { + assertEquals(3, TokenizeOption.values().length); + assertEquals(TokenizeOption.NORMAL, TokenizeOption.values()[0]); + assertEquals(TokenizeOption.HOST, TokenizeOption.values()[1]); + assertEquals(TokenizeOption.URL, TokenizeOption.values()[2]); + } + + @Test + void tokenTypeHasSixValues() { + assertEquals(6, Token.Type.values().length); + } + + @Test + void tokenSegTypeHasFiveValues() { + assertEquals(5, Token.SegType.values().length); + } + + @Test + void getInstance_throwsWhenDictDirectoryMissing() { + assertThrows(java.io.IOException.class, + () -> Tokenizer.getInstance("/no/such/path/to/dicts")); + } +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerTestHelper.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerTestHelper.java new file mode 100644 index 0000000..97ff751 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/TokenizerTestHelper.java @@ -0,0 +1,24 @@ +package com.coccoc; + +import java.lang.reflect.Field; + +/** + * Test-only utility to reset the Tokenizer singleton between test cases. + * Uses reflection to avoid putting test infrastructure in production code. 
+ */ +public final class TokenizerTestHelper { + private TokenizerTestHelper() {} + + public static synchronized void resetForTesting() { + try { + Field inst = Tokenizer.class.getDeclaredField("instance"); + Field path = Tokenizer.class.getDeclaredField("initializedDictPath"); + inst.setAccessible(true); + path.setAccessible(true); + inst.set(null, null); + path.set(null, null); + } catch (ReflectiveOperationException e) { + throw new RuntimeException("resetForTesting failed", e); + } + } +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/bigram/BigramScoresTest.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/bigram/BigramScoresTest.java new file mode 100644 index 0000000..e62336d --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/bigram/BigramScoresTest.java @@ -0,0 +1,49 @@ +package com.coccoc.internal.bigram; + +import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.*; + +class BigramScoresTest { + + // Fixture: 3-row sparse matrix + // row 0: col 1 -> 3.0f + // row 1: col 0 -> 1.5f, col 2 -> 2.5f + // row 2: (empty) + private static final int[] ROW_OFFSET = {0, 1, 3, 3}; + private static final int[] COL_INDEX = {1, 0, 2}; + private static final float[] VALUE = {3.0f, 1.5f, 2.5f}; + + private final BigramScores scores = new BigramScores(ROW_OFFSET, COL_INDEX, VALUE); + + @Test + void getScore_existingPair_returnsValue() { + assertEquals(3.0f, scores.getScore(0, 1), 1e-6f); + } + + @Test + void getScore_missingPair_returnsDefault() { + assertEquals(BigramScores.DEFAULT_SCORE, scores.getScore(0, 0), 1e-6f); + } + + @Test + void getScore_secondRow_bothPairsFound() { + assertEquals(1.5f, scores.getScore(1, 0), 1e-6f); + assertEquals(2.5f, scores.getScore(1, 2), 1e-6f); + } + + @Test + void getScore_emptyRow_returnsDefault() { + assertEquals(BigramScores.DEFAULT_SCORE, scores.getScore(2, 0), 1e-6f); + } + + @Test + void 
getScore_negativeRowIndex_throwsArrayIndexOutOfBounds() { + assertThrows(ArrayIndexOutOfBoundsException.class, () -> scores.getScore(-1, 0)); + } + + @Test + void defaultScore_isZero() { + // C++ contributes 0 for absent pairs (additive bonus model, not penalty) + assertEquals(0.0f, BigramScores.DEFAULT_SCORE, 1e-6f); + } +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/io/DictReaderTest.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/io/DictReaderTest.java new file mode 100644 index 0000000..b4cbba5 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/io/DictReaderTest.java @@ -0,0 +1,219 @@ +package com.coccoc.internal.io; + +import com.coccoc.internal.build.TriePacker; +import com.coccoc.internal.trie.MultitermTrie; +import com.coccoc.internal.bigram.BigramScores; +import com.coccoc.internal.trie.SyllableTrie; +import com.coccoc.tools.DictCompileTestSupport; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +class DictReaderTest { + + @TempDir + Path tempDir; + + @Test + void readMultiterm_roundTripsTinyTrie() throws IOException { + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"ab"}); + MultitermTrie written = TriePacker.pack(root); + + Path binFile = tempDir.resolve("multiterm.bin"); + DictCompileTestSupport.writeMultitermBin(binFile, written); + + MultitermTrie read = DictReader.readMultiterm(binFile); + + int nodeA = read.findChild(0, 'a'); + assertNotEquals(-1, nodeA, "should find child 'a'"); + int nodeB = read.findChild(nodeA, 'b'); + assertNotEquals(-1, nodeB, "should find child 'b' after 'a'"); + assertTrue(read.isEnding(nodeB), "node at 'ab' should be an ending"); + } + @Test + void readMultiterm_rejectsBadMagic() throws IOException { + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"a"}); + Path 
binFile = tempDir.resolve("multiterm_badmagic.bin"); + DictCompileTestSupport.writeMultitermBin(binFile, TriePacker.pack(root)); + + byte[] bytes = java.nio.file.Files.readAllBytes(binFile); + bytes[0] = 'X'; // corrupt magic + java.nio.file.Files.write(binFile, bytes); + + IOException ex = assertThrows(IOException.class, + () -> DictReader.readMultiterm(binFile)); + assertTrue(ex.getMessage().contains("bad magic"), "message: " + ex.getMessage()); + } + @Test + void readMultiterm_rejectsVersionMismatch() throws IOException { + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"a"}); + Path binFile = tempDir.resolve("multiterm_badver.bin"); + DictCompileTestSupport.writeMultitermBin(binFile, TriePacker.pack(root)); + + byte[] bytes = java.nio.file.Files.readAllBytes(binFile); + bytes[4] = 2; bytes[5] = 0; bytes[6] = 0; bytes[7] = 0; // version = 2 LE + // Recompute CRC to avoid crc-check shadowing the version error + java.util.zip.CRC32 crc = new java.util.zip.CRC32(); + crc.update(bytes, 4, bytes.length - 8); + int cv = (int) crc.getValue(); + int pe = bytes.length - 4; + bytes[pe] = (byte) cv; + bytes[pe+1] = (byte)(cv >> 8); + bytes[pe+2] = (byte)(cv >> 16); + bytes[pe+3] = (byte)(cv >> 24); + java.nio.file.Files.write(binFile, bytes); + + IOException ex = assertThrows(IOException.class, + () -> DictReader.readMultiterm(binFile)); + assertTrue(ex.getMessage().contains("version mismatch"), "message: " + ex.getMessage()); + } + @Test + void readMultiterm_rejectsCrcMismatch() throws IOException { + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"a"}); + Path binFile = tempDir.resolve("multiterm_badcrc.bin"); + DictCompileTestSupport.writeMultitermBin(binFile, TriePacker.pack(root)); + + byte[] bytes = java.nio.file.Files.readAllBytes(binFile); + // Flip a payload byte (byte 8 = first byte after version) without updating CRC + bytes[8] ^= 0xFF; + java.nio.file.Files.write(binFile, bytes); + + IOException ex = 
assertThrows(IOException.class, + () -> DictReader.readMultiterm(binFile)); + assertTrue(ex.getMessage().contains("crc mismatch"), "message: " + ex.getMessage()); + } + @Test + void readSyllable_roundTripsIndexAndCount() throws IOException { + SyllableTrie written = DictCompileTestSupport.buildSyllableTrie("xin"); + + // Locate terminal node for "xin" and assign bigram row index 42 + int node = 0; + for (int cp : "xin".codePoints().toArray()) node = written.findChild(node, cp); + written.setIndex(node, 42); + + Path binFile = tempDir.resolve("syllable.bin"); + DictCompileTestSupport.writeSyllableBin(binFile, written, /*syllableCount=*/7); + + SyllableTrie read = DictReader.readSyllable(binFile); + + int rNode = 0; + for (int cp : "xin".codePoints().toArray()) rNode = read.findChild(rNode, cp); + assertNotEquals(-1, rNode, "should find 'xin' in round-tripped trie"); + assertEquals(42, read.getIndex(rNode), "bigram row index should survive round-trip"); + } + @Test + void readBigram_roundTripsCsr() throws IOException { + // 2-row CSR: row 0 has {col=1, val=0.5f}, row 1 is empty + int n = 2; + int[] rowOffset = {0, 1, 1}; + int[] colIndex = {1}; + float[] value = {0.5f}; + + Path binFile = tempDir.resolve("bigram.bin"); + DictCompileTestSupport.writeBigramBin(binFile, n, rowOffset, colIndex, value); + + BigramScores scores = DictReader.readBigram(binFile); + + assertEquals(n, scores.rowCount(), "rowCount"); + assertArrayEquals(rowOffset, scores.rowOffsets(), "rowOffset"); + assertArrayEquals(colIndex, scores.colIndex(), "colIndex"); + assertArrayEquals(value, scores.values(), 0.0f, "value"); + } + // P0#2 — non-monotonic rowOffset must be rejected at load time + @Test + void readBigram_rejectsNonMonotonicRowOffset() throws IOException { + // rowOffset[1]=5 but totalNnz (=rowOffset[2])=3: 5 > 3 violates monotone invariant. + // Without the validation, BigramScores.getScore(0, j) would do binarySearch on [0..5) + // when colIndex.length==3, causing AIOOBE at runtime. 
+ int n = 2; + int[] rowOffset = {0, 5, 3}; // rowOffset[1]=5 > rowOffset[2]=3 + int[] colIndex = {0, 1, 2}; // 3 entries (matches totalNnz=rowOffset[2]=3) + float[] value = {0.1f, 0.2f, 0.3f}; + + Path binFile = tempDir.resolve("bigram_bad_rowoffset.bin"); + DictCompileTestSupport.writeBigramBin(binFile, n, rowOffset, colIndex, value); + + IOException ex = assertThrows(IOException.class, + () -> DictReader.readBigram(binFile)); + assertTrue(ex.getMessage().contains("rowOffset invariant"), + "expected 'rowOffset invariant' in message, got: " + ex.getMessage()); + } + + // P0#3 — NaN weights in multiterm.bin must be sanitized to NEGATIVE_INFINITY at load time + @Test + void readMultiterm_sanitizesNanWeightToNegativeInfinity() throws IOException { + // Build a valid trie, corrupt one weight to NaN via the live array reference, + // write it to disk, then verify DictReader replaces NaN with NEGATIVE_INFINITY. + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"ab"}); + MultitermTrie trie = TriePacker.pack(root); + + // The live weightArray() allows in-place NaN injection before serialization. 
+ float[] weights = trie.weightArray(); + boolean injected = false; + for (int i = 0; i < weights.length; i++) { + if (weights[i] > 0f) { // ending-node weight is 1.0f from buildHashTrie + weights[i] = Float.NaN; + injected = true; + break; + } + } + assertTrue(injected, "test setup: failed to find a positive-weight node to corrupt"); + + Path binFile = tempDir.resolve("multiterm_nan.bin"); + DictCompileTestSupport.writeMultitermBin(binFile, trie); + + MultitermTrie loaded = DictReader.readMultiterm(binFile); + + for (float w : loaded.weightArray()) { + assertFalse(Float.isNaN(w), + "after loading, no weight should be NaN; expected NEGATIVE_INFINITY in place of NaN"); + } + } + // P1#21a — readSyllable must reject a corrupted magic header + @Test + void readSyllable_rejectsBadMagic() throws IOException { + SyllableTrie written = DictCompileTestSupport.buildSyllableTrie("xin"); + Path binFile = tempDir.resolve("syllable_badmagic.bin"); + DictCompileTestSupport.writeSyllableBin(binFile, written, 1); + + byte[] bytes = java.nio.file.Files.readAllBytes(binFile); + bytes[0] = 'X'; // corrupt first byte of "CCSY" + java.nio.file.Files.write(binFile, bytes); + + IOException ex = assertThrows(IOException.class, + () -> DictReader.readSyllable(binFile)); + assertTrue(ex.getMessage().contains("bad magic"), "message: " + ex.getMessage()); + } + + // P1#21b — readBigram must reject a corrupted magic header + @Test + void readBigram_rejectsBadMagic() throws IOException { + int[] rowOffset = {0, 0}; int[] colIndex = {}; float[] value = {}; + Path binFile = tempDir.resolve("bigram_badmagic.bin"); + DictCompileTestSupport.writeBigramBin(binFile, 1, rowOffset, colIndex, value); + + byte[] bytes = java.nio.file.Files.readAllBytes(binFile); + bytes[0] = 'X'; // corrupt first byte of "CCBG" + java.nio.file.Files.write(binFile, bytes); + + IOException ex = assertThrows(IOException.class, + () -> DictReader.readBigram(binFile)); + assertTrue(ex.getMessage().contains("bad magic"), 
"message: " + ex.getMessage()); + } + + // P1#21c — truncated file (fewer than 12 bytes) must be rejected immediately + @Test + void readMultiterm_rejectsTooShortFile() throws IOException { + Path binFile = tempDir.resolve("multiterm_tooshort.bin"); + java.nio.file.Files.write(binFile, new byte[]{0x43, 0x43, 0x4D}); // 3 bytes: "CCM" (incomplete magic) + + IOException ex = assertThrows(IOException.class, + () -> DictReader.readMultiterm(binFile)); + assertTrue(ex.getMessage().contains("truncated"), "message: " + ex.getMessage()); + } + +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/lang/VnLangToolTest.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/lang/VnLangToolTest.java new file mode 100644 index 0000000..8fb389c --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/lang/VnLangToolTest.java @@ -0,0 +1,106 @@ +package com.coccoc.internal.lang; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.*; + +/** + * M2 parity tests for VnLangTool. + * All expected values are cross-checked against vn_lang_tool.hpp behavior. + */ +class VnLangToolTest { + + @BeforeAll + static void initTables() { + // Simple mode: ASCII + VN charsets; no dict files required. 
+ VnLangTool.initSimple(); + } + + // --- lower / upper --- + + @Test void lowerAsciiUppercase() { assertEquals('a', VnLangTool.lower('A')); } + @Test void lowerAsciiLowercase() { assertEquals('a', VnLangTool.lower('a')); } + @Test void lowerVietnameseUppercase() { assertEquals('á', VnLangTool.lower('Á')); } + @Test void lowerVietnameseLowercase() { assertEquals('ấ', VnLangTool.lower('ấ')); } + @Test void lowerAstralUnchanged() { assertEquals(0x1F600, VnLangTool.lower(0x1F600)); } + + // --- lowerRoot --- + + @Test void lowerRootOfTonedVowel() { assertEquals('a', VnLangTool.lowerRoot('Á')); } + @Test void lowerRootOfHattedVowel() { assertEquals('a', VnLangTool.lowerRoot('â')); } + @Test void lowerRootOfPlainLetter() { assertEquals('a', VnLangTool.lowerRoot('a')); } + @Test void lowerRootOfD() { assertEquals('d', VnLangTool.lowerRoot('đ')); } + + // --- alphabet / numeric flags --- + + @Test void asciiLetterIsAlphabetic() { assertTrue(VnLangTool.isAlphabetic('z')); } + @Test void vnLetterIsAlphabetic() { assertTrue(VnLangTool.isAlphabetic('ệ')); } + @Test void digitNotAlphabetic() { assertFalse(VnLangTool.isAlphabetic('0')); } + @Test void digitIsNumeric() { assertTrue(VnLangTool.isNumeric('9')); } + @Test void letterNotNumeric() { assertFalse(VnLangTool.isNumeric('a')); } + @Test void letterIsAlphanumeric() { assertTrue(VnLangTool.isAlphanumeric('a')); } + @Test void digitIsAlphanumericFalse() { + // NOTE(review): despite its name, this test never calls isAlphanumeric('5'); the earlier + // note suggested simple mode flags only letters as alphanumeric — TODO confirm in vn_lang_tool.hpp. + // What is actually asserted here: a digit is recognized as numeric. + assertTrue(VnLangTool.isNumeric('5')); + } + + // --- isToneHat / canPutToneHat --- + + @Test void combiningAcuteIsToneHat() { assertTrue(VnLangTool.isToneHat(0x301)); } + @Test void combiningGraveIsToneHat() { assertTrue(VnLangTool.isToneHat(0x300)); } + @Test void combiningHookIsToneHat() { assertTrue(VnLangTool.isToneHat(0x309)); } + @Test void 
combiningTildeIsToneHat() { assertTrue(VnLangTool.isToneHat(0x303)); } + @Test void combiningDotBelowIsToneHat() { assertTrue(VnLangTool.isToneHat(0x323)); } + @Test void combiningCircumflexIsToneHat() { assertTrue(VnLangTool.isToneHat(0x302)); } + @Test void combiningBreveIsToneHat() { assertTrue(VnLangTool.isToneHat(0x306)); } + @Test void combiningHornIsToneHat() { assertTrue(VnLangTool.isToneHat(0x31b)); } + @Test void plainLetterNotToneHat() { assertFalse(VnLangTool.isToneHat('a')); } + @Test void plainLetterCanReceiveTone() { assertTrue(VnLangTool.canPutToneHat('a')); } + @Test void fullyTonedAndHattedLetterCannotReceiveToneOrHat() { assertFalse(VnLangTool.canPutToneHat('ấ')); } + + // --- mergeToneHat --- + + @Test void mergeToneAcuteOnA() { assertEquals('á', VnLangTool.mergeToneHat('a', 0x301)); } + @Test void mergeToneGraveOnA() { assertEquals('à', VnLangTool.mergeToneHat('a', 0x300)); } + @Test void mergeToneHookOnA() { assertEquals('ả', VnLangTool.mergeToneHat('a', 0x309)); } + @Test void mergeToneTildeOnA() { assertEquals('ã', VnLangTool.mergeToneHat('a', 0x303)); } + @Test void mergeToneDotBelowOnA() { assertEquals('ạ', VnLangTool.mergeToneHat('a', 0x323)); } + @Test void mergeToneAcuteOnCircumflexA() { assertEquals('ấ', VnLangTool.mergeToneHat('â', 0x301)); } + @Test void mergeToneAcuteOnBreveA() { assertEquals('ắ', VnLangTool.mergeToneHat('ă', 0x301)); } + @Test void mergeToneDotBelowOnCircumflexE(){ assertEquals('ệ', VnLangTool.mergeToneHat('ê', 0x323)); } + @Test void mergeHatCircumflexOnA() { assertEquals('â', VnLangTool.mergeToneHat('a', 0x302)); } + @Test void mergeHatBreveOnA() { assertEquals('ă', VnLangTool.mergeToneHat('a', 0x306)); } + @Test void mergeHatCircumflexOnE() { assertEquals('ê', VnLangTool.mergeToneHat('e', 0x302)); } + @Test void mergeHatHornOnO() { assertEquals('ơ', VnLangTool.mergeToneHat('o', 0x31b)); } + @Test void mergeHatHornOnU() { assertEquals('ư', VnLangTool.mergeToneHat('u', 0x31b)); } + @Test void 
noMergeForNonCombiningChar() { assertEquals(-1, VnLangTool.mergeToneHat('a', 'b')); } + @Test void noMergeForTonedChar() { assertEquals(-1, VnLangTool.mergeToneHat('á', 0x301)); } + + // --- normalizeNfd: NFD → NFC combining --- + + @Test void normalizeNfdSimpleWord() { + // NFD encoding of "việt": v i ê 0x323 t → should produce "việt" (NFC-like) + int[] nfd = { 'v', 'i', 0xEA /* ê */, 0x323, 't' }; + int[] result = VnLangTool.normalizeNfd(nfd); + assertEquals(4, result.length, "ê+0x323 should merge into ệ, reducing length"); + assertEquals('v', result[0]); + assertEquals('i', result[1]); + assertEquals('ệ', result[2]); + assertEquals('t', result[3]); + } + + @Test void normalizeNfdAlreadyNfc() { + // NFC "viêt" — 'ê' is already composed, no combining marks + int[] nfc = { 'v', 'i', 0xEA /* ê */, 't' }; + int[] result = VnLangTool.normalizeNfd(nfc); + assertArrayEquals(nfc, result); + } + + @Test void normalizeNfdCollapseDuplicateSpaces() { + int[] input = { 'a', ' ', ' ', 'b' }; + int[] result = VnLangTool.normalizeNfd(input, true); + assertArrayEquals(new int[]{ 'a', ' ', 'b' }, result); + } +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/segment/SegmenterTest.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/segment/SegmenterTest.java new file mode 100644 index 0000000..83142c7 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/segment/SegmenterTest.java @@ -0,0 +1,229 @@ +package com.coccoc.internal.segment; + +import com.coccoc.Token; +import com.coccoc.internal.build.TriePacker; +import com.coccoc.internal.lang.VnLangTool; +import com.coccoc.internal.trie.MultitermTrie; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +class SegmenterTest { + + @BeforeAll + static void init() { + VnLangTool.initSimple(); + } + + private static MultitermTrie simpleTrie() { + TriePacker.HashNode 
root = TriePacker.buildHashTrie(new String[]{"a"}); + return TriePacker.pack(root); + } + + // Slice 1 — empty input edge case + @Test + void segment_emptyString_returnsEmpty() { + Segmenter seg = new Segmenter(simpleTrie()); + assertTrue(seg.segment("").isEmpty()); + } + + // Slice 2 — ASCII word via single-char fallback merge + @Test + void segment_singleAsciiWord_returnsWordToken() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("hello"); + assertEquals(1, tokens.size()); + assertEquals("hello", tokens.get(0).getText()); + assertEquals(Token.Type.WORD, tokens.get(0).getType()); + } + + // Slice 3 — space as explicit SPACE token between two words + @Test + void segment_twoWordsWithSpace_returnsWordSpaceWord() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("a b"); + assertEquals(3, tokens.size()); + assertEquals("a", tokens.get(0).getText()); + assertEquals(Token.Type.WORD, tokens.get(0).getType()); + assertEquals(" ", tokens.get(1).getText()); + assertEquals(Token.Type.SPACE, tokens.get(1).getType()); + assertEquals("b", tokens.get(2).getText()); + assertEquals(Token.Type.WORD, tokens.get(2).getType()); + } + + // Slice 4 — punctuation character after a word + @Test + void segment_punctuation_returnsPunctToken() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("hello!"); + assertEquals(2, tokens.size()); + assertEquals("hello", tokens.get(0).getText()); + assertEquals(Token.Type.WORD, tokens.get(0).getType()); + assertEquals("!", tokens.get(1).getText()); + assertEquals(Token.Type.PUNCT, tokens.get(1).getType()); + } + + // Slice 5 — digit sequence classified as NUMBER + @Test + void segment_number_returnsNumberToken() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("42"); + assertEquals(1, tokens.size()); + assertEquals("42", tokens.get(0).getText()); + assertEquals(Token.Type.NUMBER, tokens.get(0).getType()); + } + + // Slice 6 — known trie entry 
returned as one token (exercises trie path, not fallback) + @Test + void segment_knownDictWord_returnsSingleToken() { + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"hello"}); + MultitermTrie trie = TriePacker.pack(root); + Segmenter seg = new Segmenter(trie); + List<Token> tokens = seg.segment("hello world"); + assertEquals(3, tokens.size()); + assertEquals("hello", tokens.get(0).getText()); + assertEquals(Token.Type.WORD, tokens.get(0).getType()); + assertEquals(Token.Type.SPACE, tokens.get(1).getType()); + assertEquals("world", tokens.get(2).getText()); + assertEquals(Token.Type.WORD, tokens.get(2).getType()); + } + + // M7c — multi-syllable Vietnamese: space (0x20) is a valid trie edge + @Test + void segment_multiSyllableVietnamese_spaceEdge_returnsSingleToken() { + // Build a trie with "xin chao" as one entry (ASCII approximation avoids NFD issues) + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"xin chao"}); + MultitermTrie trie = TriePacker.pack(root); + Segmenter seg = new Segmenter(trie); + List<Token> tokens = seg.segment("xin chao"); + // space-edge trie match should return one WORD, not [WORD, SPACE, WORD] + assertEquals(1, tokens.size()); + assertEquals("xin chao", tokens.get(0).getText()); + assertEquals(Token.Type.WORD, tokens.get(0).getType()); + } + + // M7d — sticky syllable segmentation via SyllableTrie Viterbi + @Test + void splitSyllables_knownStickyPhrase_returnsCorrectSyllables() { + com.coccoc.internal.trie.SyllableTrie sylTrie = + com.coccoc.tools.DictCompileTestSupport.buildSyllableTrie("xin", "chao"); + // Empty bigram scores (2 rows, no entries) + com.coccoc.internal.bigram.BigramScores noScores = + new com.coccoc.internal.bigram.BigramScores( + new int[]{0, 0, 0}, new int[]{}, new float[]{}); + + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[0]); + MultitermTrie emptyMt = TriePacker.pack(root); + Segmenter seg = new Segmenter(emptyMt, sylTrie, noScores); + + java.util.List<String> syllables = 
seg.splitSyllables("xinchao"); + assertEquals(2, syllables.size()); + assertEquals("xin", syllables.get(0)); + assertEquals("chao", syllables.get(1)); + } + + // M7e — HOST mode: split on dots, each label is a WORD token + @Test + void segment_hostMode_splitsOnDots() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("www.google.com", + com.coccoc.TokenizeOption.HOST, false); + assertEquals(3, tokens.size()); + assertTrue(tokens.stream().allMatch(t -> t.getType() == Token.Type.WORD)); + assertEquals("www", tokens.get(0).getText()); + assertEquals("google", tokens.get(1).getText()); + assertEquals("com", tokens.get(2).getText()); + } + + // M7e — URL mode: strip scheme, each alphanumeric segment is a WORD token + @Test + void segment_urlMode_stripsSchemeAndSegments() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("https://example.com/path", + com.coccoc.TokenizeOption.URL, false); + // expect word tokens: example, com, path (separators omitted or PUNCT) + long wordCount = tokens.stream() + .filter(t -> t.getType() == Token.Type.WORD).count(); + assertTrue(wordCount >= 3, "expected at least 3 WORD tokens, got " + wordCount); + List<String> words = tokens.stream() + .filter(t -> t.getType() == Token.Type.WORD) + .map(Token::getText).collect(java.util.stream.Collectors.toList()); + assertTrue(words.contains("example"), "expected 'example' token"); + assertTrue(words.contains("com"), "expected 'com' token"); + assertTrue(words.contains("path"), "expected 'path' token"); + } + + // M7f — post-hoc: NUMBER + "%" PUNCT → WORD + @Test + void segment_numberPercent_returnsWordToken() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("100%"); + assertEquals(1, tokens.size(), "100% should be one token, got: " + tokens); + assertEquals(Token.Type.WORD, tokens.get(0).getType()); + assertEquals("100%", tokens.get(0).getText()); + } + + // M7f — post-hoc: NUMBER + ordinal suffix → WORD + @Test + void 
segment_ordinalSuffix_returnsWordToken() { + Segmenter seg = new Segmenter(simpleTrie()); + for (String ord : new String[]{"1st", "2nd", "3rd", "4th"}) { + List<Token> tokens = seg.segment(ord); + assertEquals(1, tokens.size(), ord + " should be one token, got: " + tokens); + assertEquals(Token.Type.WORD, tokens.get(0).getType(), ord + " should be WORD"); + assertEquals(ord, tokens.get(0).getText()); + } + } + // M8a — keepPunct=false removes SPACE and PUNCT from NORMAL mode result + @Test + void segment_normalMode_keepPunctFalse_removesSpaceAndPunct() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("hello!", com.coccoc.TokenizeOption.NORMAL, false); + assertTrue(tokens.stream().noneMatch(t -> + t.getType() == Token.Type.SPACE || t.getType() == Token.Type.PUNCT), + "keepPunct=false must remove SPACE and PUNCT; got: " + tokens); + assertEquals(1, tokens.size()); + assertEquals("hello", tokens.get(0).getText()); + } + + // M8b — keepPunct=false removes SPACE between words + @Test + void segment_normalMode_keepPunctFalse_removesSpaceBetweenWords() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("a b", com.coccoc.TokenizeOption.NORMAL, false); + assertEquals(2, tokens.size(), "keepPunct=false should leave only WORD tokens; got: " + tokens); + assertEquals("a", tokens.get(0).getText()); + assertEquals("b", tokens.get(1).getText()); + } + + // M8c — keepPunct=true keeps PUNCT but removes SPACE + @Test + void segment_normalMode_keepPunctTrue_keepsPunctRemovesSpace() { + Segmenter seg = new Segmenter(simpleTrie()); + List<Token> tokens = seg.segment("a b!", com.coccoc.TokenizeOption.NORMAL, true); + assertTrue(tokens.stream().noneMatch(t -> t.getType() == Token.Type.SPACE), + "keepPunct=true must still remove SPACE; got: " + tokens); + long punctCount = tokens.stream().filter(t -> t.getType() == Token.Type.PUNCT).count(); + assertEquals(1, punctCount, "keepPunct=true must preserve PUNCT tokens; got: " + tokens); + assertEquals(3, 
tokens.size(), "expected WORD('a'), WORD('b'), PUNCT('!'); got: " + tokens); + } + // M9 shouldGo regression — compound "viet" must beat sub-words "vi"+"et" + // Without the shouldGo gate, position 2 is re-scanned: "et" (score=1.0) stacks on + // "vi" (score=1.0) giving total 2.0 > 1.0 for "viet", producing 2 tokens. + // With the gate, position 2 is interior and not re-scanned, so "viet" wins. + @Test + void segment_shouldGoGate_prefersCompoundOverSubwords() { + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"vi", "et", "viet"}); + MultitermTrie trie = TriePacker.pack(root); + Segmenter seg = new Segmenter(trie); + List<Token> tokens = seg.segment("viet"); + assertEquals(1, tokens.size(), + "shouldGo gate must prevent re-scan at pos 2; expected single token 'viet', got: " + tokens); + assertEquals("viet", tokens.get(0).getText()); + assertEquals(Token.Type.WORD, tokens.get(0).getType()); + } + +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/trie/DoubleArrayTrieTest.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/trie/DoubleArrayTrieTest.java new file mode 100644 index 0000000..a4d4869 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/internal/trie/DoubleArrayTrieTest.java @@ -0,0 +1,124 @@ +package com.coccoc.internal.trie; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.*; + +/** + * M3 trie runtime lookup tests. 
+ * + * Hand-crafted trie for words {"a", "ab"}: + * Alphabet: {a=0, b=1} + * Pool (indices 0-3): + * Node 0 root: base=1, parent=-1 + * Node 1 'a': base=2, parent=0, isEnding=true, weight=0.5 + * Node 2 vacant: base=0, parent=-1 + * Node 3 'ab': base=0, parent=1, isEnding=true, weight=0.8 + * + * findChild(0,'a')=1, findChild(0,'b')=-1, findChild(1,'b')=3, findChild(1,'a')=-1 + */ +class DoubleArrayTrieTest { + + // charMap: 'a' (97)=0, 'b' (98)=1; all others -1 + private static final int[] CHAR_MAP; + static { + CHAR_MAP = new int['b' + 1]; + java.util.Arrays.fill(CHAR_MAP, -1); + CHAR_MAP['a'] = 0; + CHAR_MAP['b'] = 1; + } + + private static final int[] BASE = { 1, 2, 0, 0 }; + private static final int[] PARENT = { -1, 0, -1, 1 }; + private static final float[] WEIGHT = { 0f, 0.5f, 0f, 0.8f }; + // bit 0 = isEnding, bit 1 = isSpecial + private static final byte[] FLAGS = { 0, 1, 0, 1 }; + private static final int[] INDEX = { -1, -1, -1, -1 }; + + private MultitermTrie multiterm; + private StringSetTrie tlds; + + @BeforeEach + void setUp() { + multiterm = new MultitermTrie(CHAR_MAP, BASE, PARENT, WEIGHT, FLAGS); + // StringSetTrie for {"com", "net"} + tlds = buildTldTrie(); + } + + // --- DoubleArrayTrie findChild --- + + @Test void findChildRootToA() { assertEquals(1, multiterm.findChild(0, 'a')); } + @Test void findChildRootToBMiss() { assertEquals(-1, multiterm.findChild(0, 'b')); } + @Test void findChildNodeAToB() { assertEquals(3, multiterm.findChild(1, 'b')); } + @Test void findChildNodeAToAMiss() { assertEquals(-1, multiterm.findChild(1, 'a')); } + @Test void findChildOutOfAlphabet(){ assertEquals(-1, multiterm.findChild(0, 'z')); } + @Test void findChildAstral() { assertEquals(-1, multiterm.findChild(0, 0x1F600)); } + + // --- MultitermTrie --- + + @Test void rootNotEnding() { assertFalse(multiterm.isEnding(0)); } + @Test void nodeAIsEnding() { assertTrue(multiterm.isEnding(1)); } + @Test void nodeAbIsEnding() { assertTrue(multiterm.isEnding(3)); } + 
@Test void weightOfNodeA() { assertEquals(0.5f, multiterm.getWeight(1), 1e-6f); } + @Test void weightOfNodeAb() { assertEquals(0.8f, multiterm.getWeight(3), 1e-6f); } + + // --- StringSetTrie contains --- + + @Test void tldComIsContained() { assertTrue(tlds.contains(new int[]{'c','o','m'}, 0, 3)); } + @Test void tldNetIsContained() { assertTrue(tlds.contains(new int[]{'n','e','t'}, 0, 3)); } + @Test void tldOrgNotContained() { assertFalse(tlds.contains(new int[]{'o','r','g'}, 0, 3)); } + @Test void tldCoIsPrefix() { assertFalse(tlds.contains(new int[]{'c','o'}, 0, 2)); } + + /** Walk "ab" through the MultitermTrie and assert we reach an ending node. */ + @Test void walkFullWord() { + int node = 0; + node = multiterm.findChild(node, 'a'); + assertNotEquals(-1, node, "should find 'a' from root"); + node = multiterm.findChild(node, 'b'); + assertNotEquals(-1, node, "should find 'b' from 'a'"); + assertTrue(multiterm.isEnding(node), "end of 'ab' should be an ending node"); + } + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + /** + * Build a StringSetTrie containing {"com", "net"} by constructing arrays directly. + * + * Alphabet: c=0, e=1, m=2, n=3, o=4, t=5 (sorted alphabetically) + * Pool layout (manually computed with base values that avoid collisions): + * Node 0 (root): base=10 + * Node 10+0=10 'c': base=20, parent=0 + * Node 10+3=13 'n': base=30, parent=0 + * Node 20+4=24 'co': base=40, parent=10 + * Node 30+1=31 'ne': base=40, parent=13 <-- shares base40 (no child collision) + * Node 40+2=42 'com': parent=24, isEnding + * Node 40+5=45 'net': parent=31, isEnding + * + * Pool size = max(45)+1 = 46. 
+ */ + private static StringSetTrie buildTldTrie() { + // charMap for lowercase letters c,e,m,n,o,t (only these 6 are needed) + int[] cm = new int['t' + 1]; + java.util.Arrays.fill(cm, -1); + cm['c'] = 0; cm['e'] = 1; cm['m'] = 2; + cm['n'] = 3; cm['o'] = 4; cm['t'] = 5; + + int SZ = 46; + int[] base = new int[SZ]; + int[] parent = new int[SZ]; + byte[] ending = new byte[SZ]; + java.util.Arrays.fill(parent, -1); + + base[0] = 10; + base[10] = 20; parent[10] = 0; // 'c' + base[13] = 30; parent[13] = 0; // 'n' + base[24] = 40; parent[24] = 10; // 'co' + base[31] = 40; parent[31] = 13; // 'ne' + parent[42] = 24; ending[42] = 1; // 'com' + parent[45] = 31; ending[45] = 1; // 'net' + + return new StringSetTrie(cm, base, parent, ending); + } +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/tools/DictCompileTest.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/tools/DictCompileTest.java new file mode 100644 index 0000000..0247a72 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/tools/DictCompileTest.java @@ -0,0 +1,126 @@ +package com.coccoc.tools; + +import com.coccoc.internal.io.VarintReader; +import com.coccoc.internal.build.TriePacker; +import com.coccoc.internal.trie.MultitermTrie; +import org.junit.jupiter.api.Test; +import java.io.ByteArrayInputStream; +import static org.junit.jupiter.api.Assertions.*; + +/** + * M4 unit tests for DictCompile subsystems. + * Tests the three sub-components independently before the full CLI integration. + */ +class DictCompileTest { + + // ----------------------------------------------------------------------- + // 1. VarintReader — 7-bit little-endian varint (buffered_reader.hpp:44-67) + // ----------------------------------------------------------------------- + + @Test void varintSingleByteSeries() throws Exception { + // Three 1-byte values: 0, 5, 127 + // Encoding: first byte has high bit 0; a byte with high bit 0 after power>0 + // terminates the previous number. 
+ // Stream: 0x00 0x05 0x7F — three single-byte numbers (0, 5, 127). + // Every later byte with high bit 0 starts a new number and thereby ends the previous one; 127 is expected to end at end of stream. + byte[] raw = { 0x00, 0x05, 0x7F }; + VarintReader vr = new VarintReader(new ByteArrayInputStream(raw)); + assertEquals(0, vr.nextInt()); + assertEquals(5, vr.nextInt()); + assertEquals(127, vr.nextInt()); + } + + @Test void varintTwoBytesValue200() throws Exception { + // 200 = 72 + 128 = 0b11001000 + // First byte (bits 0-6): 72 = 0x48 (high bit 0) + // Continuation byte (bits 7-13): 1 → 0x80 | 1 = 0x81 (high bit 1) + // Start byte of the next number (terminates 200): 0x00 + byte[] raw = { 0x48, (byte)0x81, 0x00 }; + VarintReader vr = new VarintReader(new ByteArrayInputStream(raw)); + assertEquals(200, vr.nextInt()); + assertEquals(0, vr.nextInt()); + } + + @Test void varintSequenceOfDeltaInts() throws Exception { + // Simulate a typical bigram row: n_pairs=2, delta1=3, val1=50, delta2=5, val2=20 + // Encode 2,3,50,5,20 as varints + byte[] raw = buildVarintStream(2, 3, 50, 5, 20); + VarintReader vr = new VarintReader(new ByteArrayInputStream(raw)); + assertEquals(2, vr.nextInt()); + assertEquals(3, vr.nextInt()); + assertEquals(50, vr.nextInt()); + assertEquals(5, vr.nextInt()); + assertEquals(20, vr.nextInt()); + } + + // ----------------------------------------------------------------------- + // 2. 
TriePacker — HashTrie → DATrie arrays + // ----------------------------------------------------------------------- + + @Test void packTrieAndLookupWords() { + // Words "a", "ab" and "b" are inserted without explicit weights; only child links and isEnding are checked here + TriePacker.HashNode root = TriePacker.buildHashTrie(new String[]{"a","ab","b"}); + MultitermTrie trie = TriePacker.pack(root); + + // Walk 'a' + int nodeA = trie.findChild(0, 'a'); + assertTrue(nodeA > 0, "root should have child 'a'"); + assertTrue(trie.isEnding(nodeA), "'a' should be an ending node"); + + // Walk 'ab' + int nodeAb = trie.findChild(nodeA, 'b'); + assertTrue(nodeAb > 0, "node 'a' should have child 'b'"); + assertTrue(trie.isEnding(nodeAb), "'ab' should be an ending node"); + + // Walk 'b' + int nodeB = trie.findChild(0, 'b'); + assertTrue(nodeB > 0, "root should have child 'b'"); + assertTrue(trie.isEnding(nodeB), "'b' should be an ending node"); + + // 'ba' doesn't exist + assertEquals(-1, trie.findChild(nodeB, 'a'), "'ba' should not exist"); + + // 'c' doesn't exist from root + assertEquals(-1, trie.findChild(0, 'c'), "'c' should not exist from root"); + } + + // ----------------------------------------------------------------------- + // 3. 
Weight formula — MultitermHashTrieNode::finalize() + // ----------------------------------------------------------------------- + + @Test void weightFormulaOneWordTerm() { + // space_count=0 → param[0]=0.38, param[1]=1 + // weight = pow(log2(freq + 3), 0.38) * pow(0+1, 1) + // For freq=10: weight = pow(log2(13), 0.38) * 1 + double expected = Math.pow(Math.log(13) / Math.log(2), 0.38) * Math.pow(1, 1.0); + assertEquals(expected, DictCompile.multitermWeight(10, 0), 1e-5); + } + + @Test void weightFormulaTwoWordTerm() { + // space_count=1 → param[2]=0.14, param[3]=2.59 + // weight = pow(log2(freq + 3), 0.14) * pow(1+1, 2.59) + double expected = Math.pow(Math.log(7) / Math.log(2), 0.14) * Math.pow(2, 2.59); + assertEquals(expected, DictCompile.multitermWeight(4, 1), 1e-5); + } + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + private byte[] buildVarintStream(int... values) { + byte[] buf = new byte[values.length * 5]; // worst case 5 bytes per value + int pos = 0; + for (int i = 0; i < values.length; i++) { + int v = values[i]; + // First byte: low 7 bits, high bit 0 + buf[pos++] = (byte)(v & 0x7F); + v >>>= 7; + // Continuation bytes: low 7 bits, high bit 1 + while (v > 0) { + buf[pos++] = (byte)(0x80 | (v & 0x7F)); + v >>>= 7; + } + } + return java.util.Arrays.copyOf(buf, pos); + } +} diff --git a/java/coccoc-tokenizer-java/src/test/java/com/coccoc/tools/DictCompileTestSupport.java b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/tools/DictCompileTestSupport.java new file mode 100644 index 0000000..c7cbfe7 --- /dev/null +++ b/java/coccoc-tokenizer-java/src/test/java/com/coccoc/tools/DictCompileTestSupport.java @@ -0,0 +1,80 @@ +package com.coccoc.tools; + +import com.coccoc.internal.build.SyllablePacker; +import com.coccoc.internal.trie.MultitermTrie; +import com.coccoc.internal.trie.SyllableTrie; + +import java.io.ByteArrayOutputStream; 
+import java.io.IOException; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.zip.CRC32; + +/** + * Test helper: exposes DictCompile's package-private binary writers so that + * tests outside com.coccoc.tools can write synthetic .bin files for round-trip + * tests without widening DictCompile's own visibility. + */ +public final class DictCompileTestSupport { + + private DictCompileTestSupport() {} + + public static void writeMultitermBin(Path out, MultitermTrie trie) throws IOException { + DictCompile.writeMultitermBin(out, trie); + } + + public static void writeSyllableBin(Path out, SyllableTrie trie, int syllableCount) + throws IOException { + DictCompile.writeSyllableBin(out, trie, syllableCount); + } + + /** + * Writes a bigram.bin directly from pre-built CSR arrays for round-trip tests. + * Mirrors DictCompile.writeBigramBin's binary layout without requiring dict sources. + */ + public static void writeBigramBin(Path out, int n, + int[] rowOffset, int[] colIndex, float[] value) + throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + CRC32 crc = new CRC32(); + baos.write("CCBG".getBytes()); + writeLE32(baos, crc, 1); // version + writeLE32(baos, crc, n); // rowCount + for (int v : rowOffset) writeLE32(baos, crc, v); + for (int v : colIndex) writeLE32(baos, crc, v); + for (float v : value) writeLEFloat(baos, crc, v); + writeLE32NoUpdate(baos, (int) crc.getValue()); + Files.write(out, baos.toByteArray()); + } + + + /** + * Builds a SyllableTrie from a list of words. + * Uses package-private HashTrieBuilder (com.coccoc.tools) + SyllablePacker. + */ + public static SyllableTrie buildSyllableTrie(String... 
words) { + DictCompile.HashTrieBuilder builder = new DictCompile.HashTrieBuilder(); + for (String w : words) builder.add(w, 1, false); + return SyllablePacker.packFromPool(builder.pool); + } + + // ── LE helpers (mirrors DictCompile package-private helpers) ───────────── + + private static void writeLE32(OutputStream out, CRC32 crc, int v) throws IOException { + byte[] b = {(byte) v, (byte) (v >> 8), (byte) (v >> 16), (byte) (v >> 24)}; + crc.update(b); + out.write(b); + } + + private static void writeLEFloat(OutputStream out, CRC32 crc, float v) throws IOException { + writeLE32(out, crc, Float.floatToRawIntBits(v)); + } + + private static void writeLE32NoUpdate(OutputStream out, int v) throws IOException { + out.write((byte) v); + out.write((byte) (v >> 8)); + out.write((byte) (v >> 16)); + out.write((byte) (v >> 24)); + } +} diff --git a/java/pom.xml b/java/pom.xml new file mode 100644 index 0000000..6a70d69 --- /dev/null +++ b/java/pom.xml @@ -0,0 +1,93 @@ + + + 4.0.0 + + com.coccoc + coccoc-tokenizer-java-parent + 1.0.0-SNAPSHOT + pom + + coccoc-tokenizer Java parent + Pure-Java 21+ port of the CocCoc Vietnamese tokenizer + https://github.com/coccoc/coccoc-tokenizer + + + + GNU Lesser General Public License v3.0 + https://www.gnu.org/licenses/lgpl-3.0.html + repo + + + + + coccoc-tokenizer-java-dicts + coccoc-tokenizer-java + + + + UTF-8 + 21 + 5.11.4 + + 3.5.0 + + ${maven.multiModuleProjectDirectory}/../dicts + + + + + + org.junit.jupiter + junit-jupiter + ${junit.version} + test + + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + ${maven.compiler.release} + UTF-8 + + + + org.apache.maven.plugins + maven-surefire-plugin + 3.5.2 + + --enable-native-access=ALL-UNNAMED + + + + org.apache.maven.plugins + maven-failsafe-plugin + 3.5.2 + + --enable-native-access=ALL-UNNAMED + + + + org.apache.maven.plugins + maven-jar-plugin + 3.4.2 + + + org.codehaus.mojo + exec-maven-plugin + ${exec.plugin.version} + + + + + diff --git 
a/java/src/java/Token.java b/java/src/java/Token.java deleted file mode 100644 index 2e08921..0000000 --- a/java/src/java/Token.java +++ /dev/null @@ -1,229 +0,0 @@ -package com.coccoc; - -import java.util.ArrayList; -import java.util.List; - -public final class Token implements Cloneable { - public static Token FULL_STOP = new Token(".", Type.PUNCT, SegType.END_SEG_TYPE, -1, -1); - public static Token COMMA = new Token(",", Type.PUNCT, SegType.END_SEG_TYPE, -1, -1); - public static Token SPACE = new Token(" ", Type.SPACE, -1, -1); - - public enum Type { - WORD, - NUMBER, - SPACE, - PUNCT, - WHOLE_URL, - SITE_URL; - - private static Type[] values = null; - public static Type fromInt(int i) { - if (Type.values == null) { - Type.values = Type.values(); - } - return Type.values[i]; - } - } - - public enum SegType { - OTHER_SEG_TYPE, - SKIP_SEG_TYPE, - URL_SEG_TYPE, - END_URL_TYPE, - END_SEG_TYPE; - - private static SegType[] values = null; - public static SegType fromInt(int i) { - if (SegType.values == null) { - SegType.values = SegType.values(); - } - return SegType.values[i]; - } - }; - - private final String text; - private final Type type; - private SegType segType; // Nullable - private boolean splittedByDot; - private final int startPos; - private final int endPos; - - public Token(String text, int start, int end) { - this(text, Type.WORD, start, end); - } - - public Token(String text, Type type, int start, int end) { - this(text, type, null, start, end); - } - - public Token cloneWithNewText(String newText, int newEnd) { - return new Token(newText, type, segType, splittedByDot, startPos, endPos); - } - - public Token(String text, Type type, SegType segType, int start, int end) { - this(text, type, segType, false, start, end); - } - - public Token(String text, Type type, SegType segType, boolean splittedByDot, int start, int end) { - this.text = text; - this.type = type; - this.segType = segType; - this.splittedByDot = splittedByDot; - this.startPos = start; - 
this.endPos = end > 0 ? end : start + text.length(); - } - - public String getText() { - return text; - } - - public Type getType() { - return type; - } - - public int getPos() { - return startPos; - } - - public int getEndPos() { - return endPos; - } - - public SegType getSegType() { - return segType; - } - - public static ArrayList toStringList(List tokenList) { - ArrayList temp = new ArrayList<>(); - for (Token token : tokenList) { - temp.add(token.getText()); - } - return temp; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append(type).append(" `").append(text).append('`'); - if (segType == SegType.END_SEG_TYPE) { - sb.append(" END"); - } else if (segType == SegType.URL_SEG_TYPE) { - sb.append(" URL"); - } else if (segType == SegType.SKIP_SEG_TYPE) { - sb.append(" SKIP"); - } else if (segType == SegType.END_URL_TYPE) { - sb.append(" END_URL"); - } else { - sb.append(" OTHER"); - } - sb.append(' ').append(startPos).append('-').append(endPos); - return sb.toString(); - } - - @Override - public Token clone() { - return new Token(text, type, segType, startPos, endPos); - } - - @Override - public int hashCode() { - int hash = text.hashCode() ^ type.hashCode(); - return hash; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - final Token that = (Token) obj; - - if (!this.text.equals(that.text)) { - return false; - } - - if (this.type != that.type) { - return false; - } - - return true; - } - - public boolean isWord() { - return type == Type.WORD; - } - - public boolean isPunct() { - return type == Type.PUNCT; - } - - public boolean isNumber() { - return type == Type.NUMBER; - } - - public boolean isWholeUrl() { - return type == Type.WHOLE_URL; - } - - public boolean isSiteUrl() { - return type == Type.SITE_URL; - } - - public boolean isSpace() { - return type == Type.SPACE; - 
} - - public boolean isEndSeg() { - return segType == SegType.END_SEG_TYPE; - } - - public boolean isSplittedByDot() { - return splittedByDot; - } - - public void setEndSeg() { - segType = SegType.END_SEG_TYPE; - } - - public void setOtherSeg() { - segType = SegType.OTHER_SEG_TYPE; - } - - public void setEndUrlSeg() { - segType = SegType.END_URL_TYPE; - } - - public void setUrlSeg() { - segType = SegType.URL_SEG_TYPE; - } - - public void setSkipSeg() { - segType = SegType.SKIP_SEG_TYPE; - } - - public boolean isUrlSeg() { - return segType == SegType.URL_SEG_TYPE; - } - - public boolean isEndUrlSeg() { - return segType == SegType.END_URL_TYPE; - } - - public boolean isSkipSeg() { - return segType == SegType.SKIP_SEG_TYPE; - } - - public boolean isOtherSeg() { - return segType == SegType.OTHER_SEG_TYPE; - } - - public boolean isWordOrNumber() { - return isWord() || isNumber() || isSiteUrl(); - } -} diff --git a/java/src/java/Tokenizer.java b/java/src/java/Tokenizer.java deleted file mode 100644 index c08d507..0000000 --- a/java/src/java/Tokenizer.java +++ /dev/null @@ -1,142 +0,0 @@ -package com.coccoc; - -import java.util.*; -import java.io.*; - -public class Tokenizer { - public static final int TOKENIZE_NORMAL = 0; - public static final int TOKENIZE_HOST = 1; - public static final int TOKENIZE_URL = 2; - public static final String dictPath = "/usr/share/tokenizer/dicts"; // TODO: don't hardcode this value - - public native long segmentPointer(String text, boolean for_transforming, int tokenizeOption, boolean keep_puncts); - private native void freeMemory(long resPointer); - private native int initialize(String dictPath); - - static { - System.loadLibrary("coccoc_tokenizer_jni"); - } - - private static final class Loader { - private static final Tokenizer INSTANCE = get(); - - private static Tokenizer get() { - Tokenizer instance = new Tokenizer(dictPath); - return instance; - } - } - - public static Tokenizer getInstance() { - return Loader.INSTANCE; - } - - 
private Tokenizer(String dictPath) { - int status = initialize(dictPath); - if (0 > status) { - throw new RuntimeException("Cannot initialize Tokenizer"); - } - } - - public ArrayList segment(String text, boolean for_transforming, int tokenizeOption, boolean keep_puncts) { - if (text == null) { - throw new IllegalArgumentException("text is null"); - } - long resPointer = segmentPointer(text, for_transforming, tokenizeOption, keep_puncts); - - ArrayList res = new ArrayList<>(); - // Positions from JNI implementation .cpp file - long normalizedStringPointer = Unsafe.UNSAFE.getLong(resPointer + 8); - int rangesSize = (int) Unsafe.UNSAFE.getLong(resPointer + 8 * 2); - long rangesDataPointer = Unsafe.UNSAFE.getLong(resPointer + 8 * 3); - - int spacePositionsSize = (int) Unsafe.UNSAFE.getLong(resPointer + 8 * 5); - long spacePositionsDataPointer = Unsafe.UNSAFE.getLong(resPointer + 8 * 6); - int[] spacePositions = new int[spacePositionsSize + 1]; - for (int i = 0; i < spacePositionsSize; ++i) { - spacePositions[i] = Unsafe.UNSAFE.getInt(spacePositionsDataPointer + i * 4); - } - spacePositions[spacePositionsSize] = -1; - - int TOKEN_SIZE = 4 * 6; - for (int i = 0, spacePos = 0; i < rangesSize; ++i) { - // Positions of UNSAFE values are calculated from {struct Token} in tokenizer.hpp - int startPos = Unsafe.UNSAFE.getInt(rangesDataPointer + i * TOKEN_SIZE); - int endPos = Unsafe.UNSAFE.getInt(rangesDataPointer + i * TOKEN_SIZE + 4); - int originalStartPos = Unsafe.UNSAFE.getInt(rangesDataPointer + i * TOKEN_SIZE + 8); - int originalEndPos = Unsafe.UNSAFE.getInt(rangesDataPointer + i * TOKEN_SIZE + 12); - int type = Unsafe.UNSAFE.getInt(rangesDataPointer + i * TOKEN_SIZE + 16); - int segType = Unsafe.UNSAFE.getInt(rangesDataPointer + i * TOKEN_SIZE + 20); - - // Build substring from UNSAFE array of codepoints - // TODO: Is there a faster way than using StringBuilder? 
- StringBuilder sb = new StringBuilder(); - for (int j = startPos; j < endPos; ++j) { - if (j == spacePositions[spacePos]) { - sb.append(for_transforming ? '_' : ' '); - spacePos++; - } - sb.appendCodePoint(Unsafe.UNSAFE.getInt(normalizedStringPointer + j * 4)); - } - res.add(new Token(segType == 1 ? sb.toString().replace(',', '.') : sb.toString(), - Token.Type.fromInt(type), Token.SegType.fromInt(segType), originalStartPos, originalEndPos)); - } - if (for_transforming && tokenizeOption == TOKENIZE_NORMAL) { - res.add(Token.FULL_STOP); - } - freeMemory(resPointer); - return res; - } - - public ArrayList segment(String text, boolean for_transforming, int tokenizeOption) { - return segment(text, for_transforming, tokenizeOption, for_transforming); - } - - public ArrayList segment(String text, int tokenizeOption) { - return segment(text, false, tokenizeOption); - } - - public ArrayList segment(String text, boolean for_transforming) { - return segment(text, for_transforming, Tokenizer.TOKENIZE_NORMAL); - } - - public ArrayList segment(String text) { - return segment(text, false); - } - - public ArrayList segmentToStringList(String text) { - return Token.toStringList(segment(text, false)); - } - - public ArrayList segmentKeepPuncts(String text) { - return segment(text, false, TOKENIZE_NORMAL, true); - } - - public ArrayList segmentKeepPunctsToStringList(String text) { - return Token.toStringList(segmentKeepPuncts(text)); - } - - public ArrayList segmentUrl(String text) { - return segment(text, false, TOKENIZE_URL); - } - - public ArrayList segmentUrlToStringList(String text) { - return Token.toStringList(segment(text, false, TOKENIZE_URL)); - } - - public ArrayList segment4Transforming(String text) { - return segment(text, true, TOKENIZE_NORMAL); - } - - public ArrayList segment4Transforming(String text, int tokenizeOption) { - return segment(text, true, tokenizeOption); - } - - public static void main(String[] args) { - for (String text : args) { - for (Token it : 
getInstance().segment(text)) { - System.out.print(it.getText() + "\t"); - } - System.out.println(); - } - } -} diff --git a/java/src/java/Unsafe.java b/java/src/java/Unsafe.java deleted file mode 100644 index 0ba7d86..0000000 --- a/java/src/java/Unsafe.java +++ /dev/null @@ -1,109 +0,0 @@ -package com.coccoc; - -import java.io.*; -import java.lang.reflect.Field; - -public class Unsafe { - public static final sun.misc.Unsafe UNSAFE; - - static { - sun.misc.Unsafe unsafe = null; - try { - Field field = sun.misc.Unsafe.class.getDeclaredField("theUnsafe"); - field.setAccessible(true); - unsafe = (sun.misc.Unsafe) field.get(null); - } catch (Exception e) { - e.printStackTrace(); - } - UNSAFE = unsafe; - } - - private static final int BUFFER_SIZE = 1024 * 1024 * 16; - private static final String CHARSET = "iso-8859-1"; - - public static void saveUnsafeMemory(OutputStream os, long memory, long size) throws IOException { - for (long i = memory; i < memory + size; i++) { - os.write(UNSAFE.getByte(i)); - } - } - - public static long readToUnsafeMemory(File file) throws IOException { - long len = file.length(); - long memory = UNSAFE.allocateMemory(len); - try (BufferedInputStream fis = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE)) { - for (long i = 0; i < len; i++) { - UNSAFE.putByte(memory + i, (byte) fis.read()); - } - } - return memory; - } - - public static float getFloat(byte[] buffer, int offset) { - return UNSAFE.getFloat(buffer, (long) (sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET + offset)); - } - - public static double getDouble(byte[] buffer, int offset) { - return UNSAFE.getDouble(buffer, (long) (sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET + offset)); - } - - public static long getLong(byte[] buffer, int offset) { - return UNSAFE.getLong(buffer, (long) (sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET + offset)); - } - - public static int getInt(byte[] buffer, int offset) { - return UNSAFE.getInt(buffer, (long) (sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET + offset)); - 
} - - public static short getShort(byte[] buffer, int offset) { - return UNSAFE.getShort(buffer, (long) (sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET + offset)); - } - - public static byte getByte(byte[] buffer, int offset) { - return UNSAFE.getByte(buffer, (long) (sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET + offset)); - } - - public static void writeString(String str, OutputStream os, long unsafeBuffer) - throws IOException { - byte[] bytes = str.getBytes(CHARSET); - writeInt(bytes.length, os, unsafeBuffer); - os.write(bytes); - } - - public static void writeInt(int value, OutputStream os, long unsafeBuffer) throws IOException { - Unsafe.UNSAFE.putInt(unsafeBuffer, value); - Unsafe.saveUnsafeMemory(os, unsafeBuffer, 4); - } - - public static String readString(InputStream is) throws IOException { - int length = readInt(is); - byte[] bytes = new byte[length]; - is.read(bytes); - return new String(bytes, CHARSET); - } - - public static int readInt(InputStream is) throws IOException { - byte[] bytes = new byte[4]; - is.read(bytes); - return Unsafe.getInt(bytes, 0); - } - - public static void copy(byte[] values, int length, long pointer) { - UNSAFE.copyMemory(values, sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET, null, pointer, length); - } - - public static void copy(byte[] values, int off, int length, long pointer) { - UNSAFE.copyMemory(values, sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET + off, null, pointer, length); - } - - public static void copy(short[] values, int length, long pointer) { - UNSAFE.copyMemory(values, sun.misc.Unsafe.ARRAY_SHORT_BASE_OFFSET, null, pointer, length * Short.BYTES); - } - - public static void copy(int[] values, int length, long pointer) { - UNSAFE.copyMemory(values, sun.misc.Unsafe.ARRAY_INT_BASE_OFFSET, null, pointer, length * Integer.BYTES); - } - - public static void copy(long[] values, int length, long pointer) { - UNSAFE.copyMemory(values, sun.misc.Unsafe.ARRAY_LONG_BASE_OFFSET, null, pointer, length * Long.BYTES); - } -} diff --git 
a/java/src/jni/Tokenizer.cpp b/java/src/jni/Tokenizer.cpp deleted file mode 100644 index 8af334b..0000000 --- a/java/src/jni/Tokenizer.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "com_coccoc_Tokenizer.h" - -JNIEXPORT jlong JNICALL Java_com_coccoc_Tokenizer_segmentPointer( - JNIEnv *env, jobject obj, jstring jni_text, jboolean for_transforming, jint tokenize_option, jboolean keep_puncts) -{ - // Use shared-memory instead of message-passing mechanism to transfer data to Java - // return a pointer to an array of pointers - - const jchar *jtext = env->GetStringCritical(jni_text, nullptr); - int text_length = env->GetStringLength(jni_text); - std::vector< uint32_t > *text = new std::vector< uint32_t >(); - text->reserve(text_length); - - std::vector< int > original_pos; - Tokenizer::instance().normalize_for_tokenization(jtext, text_length, *text, original_pos, true); - env->ReleaseStringCritical(jni_text, jtext); - - // use pointer to avoid automatic deallocation - std::vector< Token > *ranges = new std::vector< Token >(); - std::vector< int > *space_positions = new std::vector< int >(); - - Tokenizer::instance().handle_tokenization_request< Token >( - *text, *ranges, *space_positions, original_pos, for_transforming, tokenize_option, keep_puncts); - for (size_t i = 0; i < ranges->size(); ++i) - { - ranges->at(i).original_start += original_pos[ranges->at(i).normalized_start]; - ranges->at(i).original_end += original_pos[ranges->at(i).normalized_end]; - } - - int64_t *res_pointer = new int64_t[8]; - res_pointer[0] = (int64_t) text; - res_pointer[1] = (int64_t) text->data(); // pointer to normalized string - res_pointer[2] = (int64_t) ranges->size(); - res_pointer[3] = (int64_t) ranges->data(); // pointer to raw data array inside vector - res_pointer[4] = (int64_t) ranges; // pointer to actual vector - res_pointer[5] = (int64_t) space_positions->size(); - res_pointer[6] = (int64_t) space_positions->data(); - 
res_pointer[7] = (int64_t) space_positions; - return (jlong) res_pointer; -} - -JNIEXPORT void JNICALL Java_com_coccoc_Tokenizer_freeMemory(JNIEnv *env, jobject obj, jlong res_pointer) -{ - // Cast each object pointer to their respective type, must be careful - int64_t *p = (int64_t *) res_pointer; - delete (std::vector< uint32_t > *) (p[0]); - delete (std::vector< Token > *) (p[4]); - delete (std::vector< int > *) (p[7]); - delete[](int64_t *) p; -} - -JNIEXPORT jint JNICALL Java_com_coccoc_Tokenizer_initialize(JNIEnv *env, jobject obj, jstring jni_dict_path) -{ - const char *dict_path = env->GetStringUTFChars(jni_dict_path, nullptr); - if (0 > Tokenizer::instance().initialize(std::string(dict_path))) return -1; - env->ReleaseStringUTFChars(jni_dict_path, dict_path); - return 0; -}