Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions include/cst_lts.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@
/* Letter to sound rules */
/* */
/*************************************************************************/
#ifndef _CST_LTS_H__
#define _CST_LTS_H__
#ifndef CST_LTS_H
#define CST_LTS_H

#include "cst_lib_visibility.h"
#include "cst_string.h"
#include "cst_val.h"
#include <stdint.h>
Expand Down Expand Up @@ -81,6 +82,6 @@ typedef struct cst_lts_rules_struct {

cst_lts_rules *new_lts_rules();

cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r);
MIMIC_CORE_PUBLIC cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r);

#endif
1 change: 1 addition & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@ unit_tests = [
['wave', ['-DCOMPILE_EXE_MIMIC_CORE',
'-DA_WAV1="' + join_paths(meson.current_source_dir(), 'unittests', 'hello_world.wav') + '"',
'-DA_WAV2="' + join_paths(meson.current_source_dir(), 'unittests', 'hi_again.wav') + '"']],
['lts', ['-DCOMPILE_EXE_MIMIC_CORE']],
['include_public_headers', []]
]

Expand Down
125 changes: 81 additions & 44 deletions src/lexicon/cst_lts.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,11 @@
#include "cst_features.h"
#include <stdlib.h>

/*
 * Sentinel values stored in the padded letter buffer next to the code points
 * of the word being processed.
 */
/* Placed immediately before the first and after the last letter of the word. */
#define MIMIC_WORD_LIMIT 1
/* Fills the remaining context positions outside the word limits. */
#define MIMIC_OUT_BOUNDS 2

/*
 * Forward declaration: apply_model is defined further down in this file.
 */
static cst_lts_phone apply_model(cst_lts_letter * x_vals, cst_lts_addr start,
                                 const cst_lts_rule * model);

cst_lts_rules *new_lts_rules()
{
Expand All @@ -57,58 +60,79 @@ cst_lts_rules *new_lts_rules()
return lt;
}

cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r)
/*
 * Expand a UTF-8 encoded word into a freshly allocated buffer of Unicode
 * code points, padded on both sides with context markers.
 *
 * For the word "potato" and ctx_size == 4 the buffer holds
 *     2 2 2 1 p o t a t o 1 2 2 2
 * where 1 is MIMIC_WORD_LIMIT, 2 is MIMIC_OUT_BOUNDS and the letters are
 * stored as code points: (ctx_size - 1) out-of-bounds markers plus one
 * word-limit marker on each side of the word.
 *
 * word:      UTF-8 encoded input word.
 * ctx_size:  context window size. Values below 1 are handled as 1, i.e. only
 *            the two word-limit markers are written around the word.
 * full_buff: receives the buffer allocated here with cst_alloc. This function
 *            never frees it; releasing it is left to the caller.
 *
 * Returns the number of code points found in word.
 */
static int word_to_buff(const char *word, int ctx_size,
                        cst_lts_letter ** full_buff)
{
    const cst_val *v;
    cst_val *utflets = NULL;
    cst_lts_letter *buff;
    /* ctx_size is signed: clamp it before converting, otherwise a value
       below 1 would wrap around to a huge unsigned padding count and the
       fill loops would run far past the end of the buffer. */
    const size_t pad = (ctx_size > 1) ? (size_t) ctx_size - 1 : 0;
    /* Every code point takes at least one byte, so the byte length of word
       is an upper bound for its number of code points. For ctx_size >= 1
       this is the same size as 2 * ctx_size + cst_strlen(word). */
    const size_t buff_len = 2 * (pad + 1) + cst_strlen(word);
    size_t num_cp_in_word = 0;
    size_t i = 0;
    size_t k;

    buff = cst_alloc(cst_lts_letter, buff_len);

    /* Left context: out-of-bounds padding, then the word limit. */
    for (k = 0; k < pad; k++)
    {
        buff[i++] = MIMIC_OUT_BOUNDS;
    }
    buff[i++] = MIMIC_WORD_LIMIT;

    /* The word itself, one code point per UTF-8 character. */
    utflets = cst_utf8_explode(word);
    for (v = utflets; v; v = val_cdr(v))
    {
        buff[i++] = utf8char_to_cp(val_string(val_car(v)));
        ++num_cp_in_word;
    }
    delete_val(utflets);

    /* Right context: the word limit, then out-of-bounds padding. */
    buff[i++] = MIMIC_WORD_LIMIT;
    for (k = 0; k < pad; k++)
    {
        buff[i++] = MIMIC_OUT_BOUNDS;
    }

    *full_buff = buff;
    /* The return type is int; make the narrowing from size_t explicit. */
    return (int) num_cp_in_word;
}

cst_val *lts_apply(const char *word, const char *feats,
const cst_lts_rules *r)
{
int pos;
size_t i, i2, i3, num_cp_in_word;
cst_val *phones = NULL;
cst_lts_letter *fval_buff;
cst_lts_letter *full_buff = NULL;
cst_lts_phone phone;
char *left, *right, *p;
unsigned char utf8char[5];
if (r->letter_index == NULL)
{
cst_errmsg("The letter_index that gives the initial rule for a given "
"unicode code point is missing. Malformed LTS rules\n");
return NULL;
}
/* Buffer with added contexts */
num_cp_in_word = word_to_buff(word, r->context_window_size, &full_buff);

/* For feature vals for each letter */
fval_buff = cst_alloc(cst_lts_letter, (r->context_window_size * 2) +
r->context_extra_feats);

/* Do the prediction backwards so we don't need to reverse the answer */
for (pos = r->context_window_size + num_cp_in_word - 1;
full_buff[pos] != word_limit; pos--)
full_buff[pos] != MIMIC_WORD_LIMIT; pos--)
{
int index;
/* Fill the features buffer for the predictor */
Expand All @@ -129,7 +153,7 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r)
bytes, but
feats could be UTF-8 parsed as well. If you need this, please open an
issue.
*/
*/
for (i3 = 0; i3 < cst_strlen(feats); ++i3)
{
fval_buff[i + i2 + i3] = feats[i3];
Expand All @@ -139,7 +163,7 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r)
map 256 cases but in Unicode is not that simple as we may have up to
2^21 cases (although usually less than 256) r->letter_index maps a
code point that represents a letter to the initial rule to check
*/
*/
cp_to_utf8char(full_buff[pos], utf8char);
index = cst_unicode_int_map(r->letter_index, utf8char, 0, 0);
if (index == r->letter_index->not_found)
Expand All @@ -156,12 +180,12 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r)
left =
cst_substr(r->phone_table[phone], 0,
cst_strlen(r->phone_table[phone]) - cst_strlen(p));
right = cst_substr(
r->phone_table[phone],
(cst_strlen(r->phone_table[phone]) - cst_strlen(p)) + 1,
(cst_strlen(p) - 1));
right = cst_substr(r->phone_table[phone],
(cst_strlen(r->phone_table[phone]) -
cst_strlen(p)) + 1, (cst_strlen(p) - 1));
phones =
cons_val(string_val(left), cons_val(string_val(right), phones));
cons_val(string_val(left),
cons_val(string_val(right), phones));
cst_free(left);
cst_free(right);
}
Expand All @@ -182,24 +206,37 @@ static inline cst_lts_feat get_feat(const cst_lts_feat_val feat_val)

/*
 * Extract the value part of a packed rule field: the low 21 bits of
 * feat_val (mask 0x001FFFFF), returned as a cst_lts_letter.
 */
static inline cst_lts_letter get_val(const cst_lts_feat_val feat_val)
{
    return (cst_lts_letter) (feat_val & 0x001FFFFF);
}

/*
 * Walk the rule array `model`, starting at index `start`, and return the
 * predicted phone.
 *
 * Each visited rule's feat_val is split with get_feat/get_val into a feature
 * index and a value. While the feature is not CST_LTS_EOR, the walk moves to
 * the rule's qtrue entry when x_vals[feature] equals the value and to its
 * qfalse entry otherwise. When a rule whose feature is CST_LTS_EOR is
 * reached, that rule's value is returned.
 *
 * x_vals: feature values for the letter being predicted, indexed by feature.
 * start:  index in model of the first rule to evaluate.
 * model:  the rule array.
 */
static cst_lts_phone apply_model(cst_lts_letter * x_vals, cst_lts_addr start,
                                 const cst_lts_rule * model)
{
    cst_lts_addr node = start;

    for (;;)
    {
        const cst_lts_feat question = get_feat(model[node].feat_val);
        const cst_lts_letter expected = get_val(model[node].feat_val);

        /* End-of-rule marker: the value stored here is the answer. */
        if (question == CST_LTS_EOR)
        {
            return expected;
        }

        node = (x_vals[question] == expected) ? model[node].qtrue
                                              : model[node].qfalse;
    }
}

#undef MIMIC_WORD_LIMIT
#undef MIMIC_OUT_BOUNDS
Loading