From c54fd5985edfba2b4ad6dd22967a387efe287e38 Mon Sep 17 00:00:00 2001 From: Sergio Oller Date: Wed, 1 Nov 2017 12:48:35 +0100 Subject: [PATCH] Test LTS module --- include/cst_lts.h | 7 +- meson.build | 1 + src/lexicon/cst_lts.c | 125 ++++++++++++++++---------- unittests/lts_test_main.c | 178 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 264 insertions(+), 47 deletions(-) create mode 100644 unittests/lts_test_main.c diff --git a/include/cst_lts.h b/include/cst_lts.h index 42a9928..3c97426 100644 --- a/include/cst_lts.h +++ b/include/cst_lts.h @@ -37,9 +37,10 @@ /* Letter to sound rules */ /* */ /*************************************************************************/ -#ifndef _CST_LTS_H__ -#define _CST_LTS_H__ +#ifndef CST_LTS_H +#define CST_LTS_H +#include "cst_lib_visibility.h" #include "cst_string.h" #include "cst_val.h" #include @@ -81,6 +82,6 @@ typedef struct cst_lts_rules_struct { cst_lts_rules *new_lts_rules(); -cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r); +MIMIC_CORE_PUBLIC cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r); #endif diff --git a/meson.build b/meson.build index 74bb01c..0305b8e 100644 --- a/meson.build +++ b/meson.build @@ -458,6 +458,7 @@ unit_tests = [ ['wave', ['-DCOMPILE_EXE_MIMIC_CORE', '-DA_WAV1="' + join_paths(meson.current_source_dir(), 'unittests', 'hello_world.wav') + '"', '-DA_WAV2="' + join_paths(meson.current_source_dir(), 'unittests', 'hi_again.wav') + '"']], + ['lts', ['-DCOMPILE_EXE_MIMIC_CORE']], ['include_public_headers', []] ] diff --git a/src/lexicon/cst_lts.c b/src/lexicon/cst_lts.c index bdcd14e..3785204 100644 --- a/src/lexicon/cst_lts.c +++ b/src/lexicon/cst_lts.c @@ -42,8 +42,11 @@ #include "cst_features.h" #include -static cst_lts_phone apply_model(cst_lts_letter *x_vals, cst_lts_addr start, - const cst_lts_rule *model); +#define MIMIC_WORD_LIMIT 1 +#define MIMIC_OUT_BOUNDS 2 + +static cst_lts_phone apply_model(cst_lts_letter * x_vals, cst_lts_addr start, + const cst_lts_rule * model); cst_lts_rules *new_lts_rules() { @@ -57,37 +60,24 @@ cst_lts_rules *new_lts_rules() return lt; } -cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r) +static int word_to_buff(const char *word, int ctx_size, + cst_lts_letter ** full_buff) { - int pos; - size_t i, i2, i3, num_cp_in_word; + /* Given an UTF-8 encoded word "potato", + * create full_buff, as an integer vector like + * 2 2 2 1 p o t a t o 1 2 2 2 + * where the characters are unicode code points. + * Returns the number of code points in word + */ const cst_val *v; - cst_val *phones = NULL; + size_t i, i2, num_cp_in_word; cst_val *utflets = NULL; - cst_lts_letter *fval_buff; - cst_lts_letter *full_buff; - const cst_lts_letter word_limit = 1, out_bounds = 2; - cst_lts_phone phone; - char *left, *right, *p; - unsigned char utf8char[5]; - if (r->letter_index == NULL) - { - cst_errmsg("The letter_index that gives the initial rule for a given " - "unicode code point is missing. Malformed LTS rules\n"); - return NULL; - } - /* For feature vals for each letter */ - fval_buff = cst_alloc(cst_lts_letter, (r->context_window_size * 2) + - r->context_extra_feats); - /* Buffer with added contexts */ - full_buff = - cst_alloc(cst_lts_letter, (r->context_window_size * 2) + - cst_strlen(word) + 1); /* TBD assumes single POS feat */ - for (i = 0; i < r->context_window_size - 1; i++) + *full_buff = cst_alloc(cst_lts_letter, 2 * ctx_size + cst_strlen(word)); + for (i = 0; i < ctx_size - 1; i++) { - full_buff[i] = out_bounds; + (*full_buff)[i] = MIMIC_OUT_BOUNDS; } - full_buff[i] = word_limit; + (*full_buff)[i] = MIMIC_WORD_LIMIT; ++i; /* the word */ utflets = cst_utf8_explode(word); @@ -95,20 +85,54 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r) num_cp_in_word = 0; for (v = utflets; v; v = val_cdr(v), ++i) { - full_buff[i] = utf8char_to_cp(val_string(val_car(v))); + (*full_buff)[i] = utf8char_to_cp(val_string(val_car(v))); ++num_cp_in_word; } delete_val(utflets); - full_buff[i] = word_limit; + (*full_buff)[i] = MIMIC_WORD_LIMIT; ++i; - for (i2 = i; i2 < i + r->context_window_size - 1; i2++) + for (i2 = i; i2 < i + ctx_size - 1; i2++) { - full_buff[i2] = out_bounds; + (*full_buff)[i2] = MIMIC_OUT_BOUNDS; } + /* + printf("Full buff: "); + for (size_t jjj = 0; jjj < 2*ctx_size + num_cp_in_word; ++jjj) + { + printf("%d ", full_buff[jjj]); + } + printf("\n"); + */ + return num_cp_in_word; +} + +cst_val *lts_apply(const char *word, const char *feats, + const cst_lts_rules *r) +{ + int pos; + size_t i, i2, i3, num_cp_in_word; + cst_val *phones = NULL; + cst_lts_letter *fval_buff; + cst_lts_letter *full_buff = NULL; + cst_lts_phone phone; + char *left, *right, *p; + unsigned char utf8char[5]; + if (r->letter_index == NULL) + { + cst_errmsg("The letter_index that gives the initial rule for a given " + "unicode code point is missing. Malformed LTS rules\n"); + return NULL; + } + /* Buffer with added contexts */ + num_cp_in_word = word_to_buff(word, r->context_window_size, &full_buff); + + /* For feature vals for each letter */ + fval_buff = cst_alloc(cst_lts_letter, (r->context_window_size * 2) + + r->context_extra_feats); /* Do the prediction backwards so we don't need to reverse the answer */ for (pos = r->context_window_size + num_cp_in_word - 1; - full_buff[pos] != word_limit; pos--) + full_buff[pos] != MIMIC_WORD_LIMIT; pos--) { int index; /* Fill the features buffer for the predictor */ @@ -129,7 +153,7 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r) bytes, but feats could be UTF-8 parsed as well. If you need this, please open an issue. - */ + */ for (i3 = 0; i3 < cst_strlen(feats); ++i3) { fval_buff[i + i2 + i3] = feats[i3]; @@ -139,7 +163,7 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r) map 256 cases but in Unicode is not that simple as we may have up to 2^21 cases (although usually less than 256) r->letter_index maps a code point that represents a letter to the initial rule to check - */ + */ cp_to_utf8char(full_buff[pos], utf8char); index = cst_unicode_int_map(r->letter_index, utf8char, 0, 0); if (index == r->letter_index->not_found) @@ -156,12 +180,12 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r) left = cst_substr(r->phone_table[phone], 0, cst_strlen(r->phone_table[phone]) - cst_strlen(p)); - right = cst_substr( - r->phone_table[phone], - (cst_strlen(r->phone_table[phone]) - cst_strlen(p)) + 1, - (cst_strlen(p) - 1)); + right = cst_substr(r->phone_table[phone], + (cst_strlen(r->phone_table[phone]) - + cst_strlen(p)) + 1, (cst_strlen(p) - 1)); phones = - cons_val(string_val(left), cons_val(string_val(right), phones)); + cons_val(string_val(left), + cons_val(string_val(right), phones)); cst_free(left); cst_free(right); } @@ -182,24 +206,37 @@ static inline cst_lts_feat get_feat(const cst_lts_feat_val feat_val) static inline cst_lts_letter get_val(const cst_lts_feat_val feat_val) { - return (cst_lts_letter)(feat_val & 0x001FFFFF); + return (cst_lts_letter) (feat_val & 0x001FFFFF); } -static cst_lts_phone apply_model(cst_lts_letter *x_vals, cst_lts_addr start, - const cst_lts_rule *model) +static cst_lts_phone apply_model(cst_lts_letter * x_vals, cst_lts_addr start, + const cst_lts_rule * model) { cst_lts_addr nstate = start; cst_lts_feat feat = get_feat(model[nstate].feat_val); cst_lts_letter val = get_val(model[nstate].feat_val); - + /*printf("nstate: %d\n", nstate); */ + //printf("x_vals[0..8]: %d %d %d %d %d %d %d %d\n", + //x_vals[0], x_vals[1], x_vals[2], x_vals[3], x_vals[4], x_vals[5], x_vals[6], x_vals[7]); for (; feat != CST_LTS_EOR;) { if (x_vals[feat] == val) + { + //printf("True: %d ->", nstate); nstate = model[nstate].qtrue; + //printf(" %d \n", nstate); + } else + { + //printf("False: %d ->", nstate); nstate = model[nstate].qfalse; + //printf(" %d \n", nstate); + } feat = get_feat(model[nstate].feat_val); val = get_val(model[nstate].feat_val); } return val; } + +#undef MIMIC_WORD_LIMIT +#undef MIMIC_OUT_BOUNDS diff --git a/unittests/lts_test_main.c b/unittests/lts_test_main.c new file mode 100644 index 0000000..54e8b37 --- /dev/null +++ b/unittests/lts_test_main.c @@ -0,0 +1,178 @@ +/* (C) Sergio Oller 2017 + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of the nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include "cst_val.h" +#include "cst_lexicon.h" +#include "cst_lts.h" +#include "cutest.h" + + +#define LTS_CONTEXT_SIZE 4 +#define CST_LTS_P_NAME (LTS_CONTEXT_SIZE - 1) +#define CST_LTS_N_NAME LTS_CONTEXT_SIZE +#define CST_LTS_PHONE_a 0 +#define CST_LTS_PHONE_b 1 +#define CST_LTS_CODE_POINT_a 0x61 +#define CST_LTS_CODE_POINT_b 0x62 +#define CST_LTS_CODE_POINT_euro 0x20AC + + + +void test_lts(void) +{ + cst_val *pred; + cst_lts_rule r_model[] = { + {((CST_LTS_EOR & 0xFF) << 24) | (CST_LTS_PHONE_a & 0x1FFFFF), -1, -1}, /* a */ + {((CST_LTS_EOR & 0xFF) << 24) | (CST_LTS_PHONE_b & 0x1FFFFF), -1, -1}, /* b */ + {((CST_LTS_P_NAME & 0xFF) << 24) | (CST_LTS_CODE_POINT_b & 0x1FFFFF), 1, 0}, /* if prev is b, go to 1, otherwise go to 0 */ + {((CST_LTS_N_NAME & 0xFF) << 24) | + (CST_LTS_CODE_POINT_euro & 0x1FFFFF), 2, 0} + }; + const char *const phone_table[] = { + "a", + "b" + }; + + int32_t v1[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 16 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 32 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 48 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 64 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int32_t v3_2_2[128] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 64 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int32_t *v3_2[64] = { + NULL, NULL, v3_2_2, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL + }; + int32_t **v3[16] = { + NULL, NULL, v3_2, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL + }; + map_unicode_to_int letter_index = { + v1, NULL, v3, NULL, 0xFFFD, 0 + }; + cst_lts_rules r = { + "test_rules", + r_model, + phone_table, + LTS_CONTEXT_SIZE, + 0, + &letter_index + }; + + printf("Predicting 'a'\n"); + pred = lts_apply("a", "", &r); + //val_print(stdout, pred); + TEST_CHECK(val_length(pred) == 1); + TEST_CHECK(cst_streq(val_string(val_car(pred)), "a")); + + printf("Predicting 'b'\n"); + pred = lts_apply("b", "", &r); + //val_print(stdout, pred); + TEST_CHECK(val_length(pred) == 1); + TEST_CHECK(cst_streq(val_string(val_car(pred)), "b")); + + printf("Predicting 'ab'\n"); + pred = lts_apply("ab", "", &r); + //val_print(stdout, pred); + TEST_CHECK(val_length(pred) == 2); + TEST_CHECK(cst_streq(val_string(val_car(pred)), "a")); + TEST_CHECK(cst_streq(val_string(val_car(val_cdr(pred))), "b")); + + printf("Predicting 'aab'\n"); + pred = lts_apply("aab", "", &r); + //val_print(stdout, pred); + TEST_CHECK(val_length(pred) == 3); + TEST_CHECK(cst_streq(val_string(val_car(pred)), "a")); + TEST_CHECK(cst_streq(val_string(val_car(val_cdr(pred))), "a")); + TEST_CHECK(cst_streq(val_string(val_car(val_cdr(val_cdr(pred)))), "b")); + + printf("Predicting 'abb'\n"); + pred = lts_apply("abb", "", &r); + //val_print(stdout, pred); + TEST_CHECK(val_length(pred) == 3); + TEST_CHECK(cst_streq(val_string(val_car(pred)), "a")); + TEST_CHECK(cst_streq(val_string(val_car(val_cdr(pred))), "b")); + TEST_CHECK(cst_streq(val_string(val_car(val_cdr(val_cdr(pred)))), "b")); + + printf("Predicting 'aba'\n"); + pred = lts_apply("aba", "", &r); + //val_print(stdout, pred); + TEST_CHECK(val_length(pred) == 3); + TEST_CHECK(cst_streq(val_string(val_car(pred)), "a")); + TEST_CHECK(cst_streq(val_string(val_car(val_cdr(pred))), "b")); + TEST_CHECK(cst_streq(val_string(val_car(val_cdr(val_cdr(pred)))), "b")); + + printf("Predicting 'b€€'\n"); + pred = lts_apply("b€€", "", &r); + //val_print(stdout, pred); + TEST_CHECK(val_length(pred) == 3); + TEST_CHECK(cst_streq(val_string(val_car(pred)), "b")); + TEST_CHECK(cst_streq(val_string(val_car(val_cdr(pred))), "b")); + TEST_CHECK(cst_streq(val_string(val_car(val_cdr(val_cdr(pred)))), "a")); +} + +TEST_LIST = +{ + { + "letter to sound rules", test_lts}, + { + 0} +};