MycroftAI · zeehio · Nov 1, 2017
diff --git a/include/cst_lts.h b/include/cst_lts.h
@@ -37,9 +37,10 @@
 /*  Letter to sound rules                                                */
 /*                                                                       */
 /*************************************************************************/
-#ifndef _CST_LTS_H__
-#define _CST_LTS_H__
+#ifndef CST_LTS_H
+#define CST_LTS_H
 
+#include "cst_lib_visibility.h"
 #include "cst_string.h"
 #include "cst_val.h"
 #include <stdint.h>
@@ -81,6 +82,6 @@ typedef struct cst_lts_rules_struct {
 
 cst_lts_rules *new_lts_rules();
 
-cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r);
+MIMIC_CORE_PUBLIC cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r);
 
 #endif
diff --git a/meson.build b/meson.build
@@ -458,6 +458,7 @@ unit_tests =  [
   ['wave', ['-DCOMPILE_EXE_MIMIC_CORE',
             '-DA_WAV1="' + join_paths(meson.current_source_dir(), 'unittests', 'hello_world.wav') + '"',
             '-DA_WAV2="' + join_paths(meson.current_source_dir(), 'unittests', 'hi_again.wav') + '"']],
+  ['lts', ['-DCOMPILE_EXE_MIMIC_CORE']],
   ['include_public_headers', []]
   ]
 

diff --git a/src/lexicon/cst_lts.c b/src/lexicon/cst_lts.c
@@ -42,8 +42,11 @@
 #include "cst_features.h"
 #include <stdlib.h>
 
-static cst_lts_phone apply_model(cst_lts_letter *x_vals, cst_lts_addr start,
-                                 const cst_lts_rule *model);
+#define MIMIC_WORD_LIMIT 1
+#define MIMIC_OUT_BOUNDS 2
+
+static cst_lts_phone apply_model(cst_lts_letter * x_vals, cst_lts_addr start,
+                                 const cst_lts_rule * model);
 
 cst_lts_rules *new_lts_rules()
 {
@@ -57,58 +60,79 @@ cst_lts_rules *new_lts_rules()
     return lt;
 }
 
-cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r)
+static int word_to_buff(const char *word, int ctx_size,
+                        cst_lts_letter ** full_buff)
 {
-    int pos;
-    size_t i, i2, i3, num_cp_in_word;
+    /* Given a UTF-8 encoded word such as "potato", allocate *full_buff and
+     * fill it as 2 2 2 1 p o t a t o 1 2 2 2 (shown for ctx_size == 4):
+     * ctx_size - 1 MIMIC_OUT_BOUNDS, MIMIC_WORD_LIMIT, the code points of
+     * word, MIMIC_WORD_LIMIT, ctx_size - 1 MIMIC_OUT_BOUNDS.
+     * Returns the number of code points in word; *full_buff is not freed here.
+     */
     const cst_val *v;
-    cst_val *phones = NULL;
+    size_t i, i2, num_cp_in_word;
     cst_val *utflets = NULL;
-    cst_lts_letter *fval_buff;
-    cst_lts_letter *full_buff;
-    const cst_lts_letter word_limit = 1, out_bounds = 2;
-    cst_lts_phone phone;
-    char *left, *right, *p;
-    unsigned char utf8char[5];
-    if (r->letter_index == NULL)
-    {
-        cst_errmsg("The letter_index that gives the initial rule for a given "
-                   "unicode code point is missing. Malformed LTS rules\n");
-        return NULL;
-    }
-    /* For feature vals for each letter */
-    fval_buff = cst_alloc(cst_lts_letter, (r->context_window_size * 2) +
-                              r->context_extra_feats);
-    /* Buffer with added contexts */
-    full_buff =
-        cst_alloc(cst_lts_letter, (r->context_window_size * 2) +
-                      cst_strlen(word) + 1); /* TBD assumes single POS feat */
-    for (i = 0; i < r->context_window_size - 1; i++)
+    *full_buff = cst_alloc(cst_lts_letter, 2 * ctx_size + cst_strlen(word));
+    for (i = 0; i < ctx_size - 1; i++)
     {
-        full_buff[i] = out_bounds;
+        (*full_buff)[i] = MIMIC_OUT_BOUNDS;
     }
-    full_buff[i] = word_limit;
+    (*full_buff)[i] = MIMIC_WORD_LIMIT;
     ++i;
     /* the word */
     utflets = cst_utf8_explode(word);
     /* For each UTF-8 character */
     num_cp_in_word = 0;
     for (v = utflets; v; v = val_cdr(v), ++i)
     {
-        full_buff[i] = utf8char_to_cp(val_string(val_car(v)));
+        (*full_buff)[i] = utf8char_to_cp(val_string(val_car(v)));
         ++num_cp_in_word;
     }
     delete_val(utflets);
-    full_buff[i] = word_limit;
+    (*full_buff)[i] = MIMIC_WORD_LIMIT;
     ++i;
-    for (i2 = i; i2 < i + r->context_window_size - 1; i2++)
+    for (i2 = i; i2 < i + ctx_size - 1; i2++)
     {
-        full_buff[i2] = out_bounds;
+        (*full_buff)[i2] = MIMIC_OUT_BOUNDS;
     }
+    /* Disabled debug dump of the filled buffer:
+       printf("Full buff: ");
+       for (size_t jjj = 0; jjj <  2*ctx_size + num_cp_in_word; ++jjj)
+       {
+       printf("%d ", (*full_buff)[jjj]);
+       }
+       printf("\n");
+     */
+    return num_cp_in_word;
+}
+
+cst_val *lts_apply(const char *word, const char *feats,
+                   const cst_lts_rules *r)
+{
+    int pos;
+    size_t i, i2, i3, num_cp_in_word;
+    cst_val *phones = NULL;
+    cst_lts_letter *fval_buff;
+    cst_lts_letter *full_buff = NULL;
+    cst_lts_phone phone;
+    char *left, *right, *p;
+    unsigned char utf8char[5];
+    if (r->letter_index == NULL)
+    {
+        cst_errmsg("The letter_index that gives the initial rule for a given "
+                   "unicode code point is missing. Malformed LTS rules\n");
+        return NULL;
+    }
+    /* Buffer with added contexts */
+    num_cp_in_word = word_to_buff(word, r->context_window_size, &full_buff);
+
+    /* For feature vals for each letter */
+    fval_buff = cst_alloc(cst_lts_letter, (r->context_window_size * 2) +
+                          r->context_extra_feats);
 
     /* Do the prediction backwards so we don't need to reverse the answer */
     for (pos = r->context_window_size + num_cp_in_word - 1;
-         full_buff[pos] != word_limit; pos--)
+         full_buff[pos] != MIMIC_WORD_LIMIT; pos--)
     {
         int index;
         /* Fill the features buffer for the predictor */
@@ -129,7 +153,7 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r)
            bytes, but
            feats could be UTF-8 parsed as well. If you need this, please open an
            issue.
-        */
+         */
         for (i3 = 0; i3 < cst_strlen(feats); ++i3)
         {
             fval_buff[i + i2 + i3] = feats[i3];
@@ -139,7 +163,7 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r)
            map 256 cases but in Unicode is not that simple as we may have up to
            2^21 cases (although usually less than 256) r->letter_index maps a
            code point that represents a letter to the initial rule to check
-        */
+         */
         cp_to_utf8char(full_buff[pos], utf8char);
         index = cst_unicode_int_map(r->letter_index, utf8char, 0, 0);
         if (index == r->letter_index->not_found)
@@ -156,12 +180,12 @@ cst_val *lts_apply(const char *word, const char *feats, const cst_lts_rules *r)
             left =
                 cst_substr(r->phone_table[phone], 0,
                            cst_strlen(r->phone_table[phone]) - cst_strlen(p));
-            right = cst_substr(
-                r->phone_table[phone],
-                (cst_strlen(r->phone_table[phone]) - cst_strlen(p)) + 1,
-                (cst_strlen(p) - 1));
+            right = cst_substr(r->phone_table[phone],
+                               (cst_strlen(r->phone_table[phone]) -
+                                cst_strlen(p)) + 1, (cst_strlen(p) - 1));
             phones =
-                cons_val(string_val(left), cons_val(string_val(right), phones));
+                cons_val(string_val(left),
+                         cons_val(string_val(right), phones));
             cst_free(left);
             cst_free(right);
         }
@@ -182,24 +206,37 @@ static inline cst_lts_feat get_feat(const cst_lts_feat_val feat_val)
 
 static inline cst_lts_letter get_val(const cst_lts_feat_val feat_val)
 {
-    return (cst_lts_letter)(feat_val & 0x001FFFFF);
+    return (cst_lts_letter) (feat_val & 0x001FFFFF);
 }
 
-static cst_lts_phone apply_model(cst_lts_letter *x_vals, cst_lts_addr start,
-                                 const cst_lts_rule *model)
+static cst_lts_phone apply_model(cst_lts_letter * x_vals, cst_lts_addr start,
+                                 const cst_lts_rule * model)
 {
     cst_lts_addr nstate = start;
     cst_lts_feat feat = get_feat(model[nstate].feat_val);
     cst_lts_letter val = get_val(model[nstate].feat_val);
-
+    /* Walk the rule table from `start`: at each node compare x_vals[feat] */
+    /* with the node's value and follow qtrue on a match, qfalse otherwise, */
+    /* until a CST_LTS_EOR node is reached; that node's value is returned. */
     for (; feat != CST_LTS_EOR;)
     {
         if (x_vals[feat] == val)
+        {
+            //printf("True: %d ->", nstate);
             nstate = model[nstate].qtrue;
+            //printf(" %d \n", nstate);
+        }
         else
+        {
+            //printf("False: %d ->", nstate);
             nstate = model[nstate].qfalse;
+            //printf(" %d \n", nstate);
+        }
         feat = get_feat(model[nstate].feat_val);
         val = get_val(model[nstate].feat_val);
     }
     return val;
 }
+
+#undef MIMIC_WORD_LIMIT
+#undef MIMIC_OUT_BOUNDS