From 015d9ebeff42bef4a4bb09ac7b75f47fa6d45282 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Sat, 12 Sep 2020 22:37:00 +0530 Subject: [PATCH 01/17] Initial port of varnamc from Ruby to C * Text transliteration works --- varnamc.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 varnamc.c diff --git a/varnamc.c b/varnamc.c new file mode 100644 index 0000000..9e63a78 --- /dev/null +++ b/varnamc.c @@ -0,0 +1,94 @@ +#include "varnam.h" + +#include +#include + +static char doc[] = "an Indic language transliteration library"; +static char args_doc[] = ""; +static struct argp_option options[] = { + { "symbols", 's', "VALUE", 0, "Sets the symbols file"}, + { "text", 't', "TEXT", 0, "Transliterate the given text"}, + { 0 } +}; + +struct arguments { + char *symbols; + char *text; +}; + +static error_t parse_opt(int key, char *arg, struct argp_state *state) { + struct arguments *arguments = state->input; + switch (key) { + case 's': arguments->symbols = arg; break; + case 't': arguments->text = arg; break; + case ARGP_KEY_ARG: return 0; + default: return ARGP_ERR_UNKNOWN; + } + return 0; +} + +static struct argp argp = { options, parse_opt, args_doc, doc, 0, 0, 0 }; + +static void +print_transliteration_output(const char *pattern, varray *words) +{ + int i; + vword *word; + for (i = 0; i < varray_length (words); i++) + { + word = varray_get (words, i); + printf (" %s. Confidence %d\n", word->text, word->confidence); + } +} + +int transliterate(varnam *handle, char *text) +{ + int rc; + varray *words; + + rc = varnam_transliterate (handle, text, &words); + if (rc != VARNAM_SUCCESS) + { + printf("Transliteration failed. 
Reason - %s", varnam_get_last_error(handle)); + varnam_destroy(handle); + return 1; + } + print_transliteration_output ("malayalam", words); +} + +int main(int argc, char *argv[]) +{ + struct arguments arguments; + + arguments.symbols = NULL; + arguments.text = NULL; + + argp_parse(&argp, argc, argv, 0, 0, &arguments); + + int rc; + varnam *handle; + char *msg; + + if (arguments.symbols == NULL) { + printf("varnamc : Can't load symbols file. Use --symbols option to specify the symbols file"); + return 0; + } + + /* Initialization */ + rc = varnam_init_from_id (arguments.symbols, &handle, &msg); + + printf("%d", rc); + if (rc != VARNAM_SUCCESS) + { + printf("Initialization failed. Reason - %s", msg); + free (msg); + return 1; + } + + if (arguments.text != NULL) + { + return transliterate(handle, arguments.text); + } + + return 0; +} \ No newline at end of file From 1602c6c5ac5d59756da558d08798f013685ca495 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Tue, 15 Sep 2020 19:20:12 +0530 Subject: [PATCH 02/17] Add cmake rule to build the new C version of `varnamc`. 
Move old Ruby as `varnamc.rb` --- .gitignore | 2 ++ CMakeLists.txt | 5 +++++ varnamc => varnamc.rb | 0 3 files changed, 7 insertions(+) rename varnamc => varnamc.rb (100%) diff --git a/.gitignore b/.gitignore index 78e453d..e60e7be 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ examples/stemmer distribution-tarball/ testrun.log testrun.xml + +varnamc diff --git a/CMakeLists.txt b/CMakeLists.txt index edcec9f..9117c1c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/deps/cma set(VARNAM_LIBRARY_NAME "varnam") set(VARNAM_LIBRARY_NAME_STATIC "varnamstatic") +set(VARNAM_BINARY_NAME "varnamc") set(DEPS_LIBRARY_NAME "deps") set(VARNAM_VERSION_MAJOR 3) @@ -207,3 +208,7 @@ else() target_link_libraries(${VARNAM_LIBRARY_NAME} pthread dl ${SQLITE3_LIBRARIES}) ENDIF() +FILE(GLOB_RECURSE SOURCES varnamc.c) +ADD_EXECUTABLE(${VARNAM_BINARY_NAME} ${SOURCES}) + +target_link_libraries(${VARNAM_BINARY_NAME} ${VARNAM_LIBRARY_NAME}) \ No newline at end of file diff --git a/varnamc b/varnamc.rb similarity index 100% rename from varnamc rename to varnamc.rb From b49b9df9b5a6cac08811f629507f151d9070629a Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Tue, 15 Sep 2020 23:20:32 +0530 Subject: [PATCH 03/17] Replace occurrences of Ruby varnamc binary --- CMakeLists.txt | 2 +- tests/varnamc_tests.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9117c1c..ce4bf90 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -157,7 +157,7 @@ if (BUILD_VST) # Each scheme will have a target to compile # vst will have a dependency to all these targets so that running vst will compile all the scheme files foreach(scheme ${SUPPORTED_SCHEMES}) - add_custom_target (${scheme}.vst COMMAND ./varnamc --compile schemes/${scheme}) + add_custom_target (${scheme}.vst COMMAND ./varnamc.rb --compile schemes/${scheme}) add_dependencies (vst "${scheme}.vst") 
install (FILES schemes/${scheme}.vst DESTINATION ${CMAKE_INSTALL_PREFIX}/share/varnam/vst OPTIONAL) endforeach() diff --git a/tests/varnamc_tests.c b/tests/varnamc_tests.c index 8d8b928..8534759 100644 --- a/tests/varnamc_tests.c +++ b/tests/varnamc_tests.c @@ -12,7 +12,7 @@ START_TEST (learn_failures_file_should_not_be_created_always) { int exitcode; - exitcode = system ("../varnamc -s ml -t varnam -d output/"); + exitcode = system ("../varnamc.rb -s ml -t varnam -d output/"); ck_assert_int_eq (0, exitcode); ck_assert_int_eq (0, file_exist ("output/varnamc-learn-failures.txt")); ck_assert_int_eq (0, file_exist ("output/varnamc-train-failures.txt")); @@ -27,7 +27,7 @@ START_TEST (learn_failures_file_should_be_created_upon_failures) filename = create_text_file ("not-valid-indic-word"); command = strbuf_init (20); - strbuf_addf (command, "../varnamc -s ml --learn-from %s -d output/", filename); + strbuf_addf (command, "../varnamc.rb -s ml --learn-from %s -d output/", filename); exitcode = system (strbuf_to_s (command)); ck_assert_int_eq (0, exitcode); ck_assert_int_eq (1, file_exist ("output/varnamc-learn-failures.txt")); @@ -50,7 +50,7 @@ START_TEST (training_failures_file_should_be_created_upon_failures) filename = create_text_file ("not-valid-indic-word"); command = strbuf_init (20); - strbuf_addf (command, "../varnamc -s ml --train-from %s -d output/", filename); + strbuf_addf (command, "../varnamc.rb -s ml --train-from %s -d output/", filename); exitcode = system (strbuf_to_s (command)); ck_assert_int_eq (0, exitcode); ck_assert_int_eq (1, file_exist ("output/varnamc-train-failures.txt")); From 60821868367b992ad0a98e4b974c92cbf5ab33f7 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Wed, 16 Sep 2020 01:33:23 +0530 Subject: [PATCH 04/17] Added --learn (-n). 
Bugfixes --- varnamc.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/varnamc.c b/varnamc.c index 9e63a78..4e51cbe 100644 --- a/varnamc.c +++ b/varnamc.c @@ -8,12 +8,14 @@ static char args_doc[] = ""; static struct argp_option options[] = { { "symbols", 's', "VALUE", 0, "Sets the symbols file"}, { "text", 't', "TEXT", 0, "Transliterate the given text"}, + { "learn", 'n', "TEXT", 0, "Learn the given text"}, { 0 } }; struct arguments { char *symbols; char *text; + char *learn; }; static error_t parse_opt(int key, char *arg, struct argp_state *state) { @@ -21,6 +23,7 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { switch (key) { case 's': arguments->symbols = arg; break; case 't': arguments->text = arg; break; + case 'n': arguments->learn = arg; break; case ARGP_KEY_ARG: return 0; default: return ARGP_ERR_UNKNOWN; } @@ -29,6 +32,31 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { static struct argp argp = { options, parse_opt, args_doc, doc, 0, 0, 0 }; +/** + * ----- + * Helper Functions + * ----- + */ + +/* Check if text is not a sentence */ +void ensure_single_word(char *text) { + int i = 0; + int s_length = strlen(text); + while(i < s_length) + { + if(text[i++] == ' ') { + printf("varnamc : Expected a single word."); + exit(1); + } + } +} + +/** + * ----- + * End Helper Functions + * ----- + */ + static void print_transliteration_output(const char *pattern, varray *words) { @@ -46,14 +74,35 @@ int transliterate(varnam *handle, char *text) int rc; varray *words; + ensure_single_word(text); + rc = varnam_transliterate (handle, text, &words); if (rc != VARNAM_SUCCESS) { printf("Transliteration failed. 
Reason - %s", varnam_get_last_error(handle)); varnam_destroy(handle); - return 1; + exit(1); } print_transliteration_output ("malayalam", words); + exit(0); +} + +/** + * Learn a word + */ +int learn(varnam *handle, char *text) +{ + ensure_single_word(text); + + int rc = varnam_learn (handle, text); + if (rc != VARNAM_SUCCESS) + { + printf("%s", varnam_get_last_error(handle)); + varnam_destroy(handle); + exit(1); + } + printf("Learned %s", text); + exit(0); } int main(int argc, char *argv[]) @@ -62,6 +111,7 @@ int main(int argc, char *argv[]) arguments.symbols = NULL; arguments.text = NULL; + arguments.learn = NULL; argp_parse(&argp, argc, argv, 0, 0, &arguments); @@ -77,7 +127,6 @@ int main(int argc, char *argv[]) /* Initialization */ rc = varnam_init_from_id (arguments.symbols, &handle, &msg); - printf("%d", rc); if (rc != VARNAM_SUCCESS) { printf("Initialization failed. Reason - %s", msg); @@ -87,8 +136,12 @@ int main(int argc, char *argv[]) if (arguments.text != NULL) { - return transliterate(handle, arguments.text); + transliterate(handle, arguments.text); + } else if (arguments.learn != NULL) + { + learn(handle, arguments.learn); } + /* 0 means program executed successfully */ return 0; } \ No newline at end of file From 586897bc0308f5daeaeec5edb2f61167e2dca5f2 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 01:16:29 +0530 Subject: [PATCH 05/17] Added --train (-a) --- varnamc.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 115 insertions(+), 11 deletions(-) diff --git a/varnamc.c b/varnamc.c index 4e51cbe..7521855 100644 --- a/varnamc.c +++ b/varnamc.c @@ -9,6 +9,7 @@ static struct argp_option options[] = { { "symbols", 's', "VALUE", 0, "Sets the symbols file"}, { "text", 't', "TEXT", 0, "Transliterate the given text"}, { "learn", 'n', "TEXT", 0, "Learn the given text"}, + { "train", 'a', "PATTERN=WORD", 0, "Train the given text"}, { 0 } }; @@ -16,6 +17,7 @@ struct arguments { char *symbols; char *text; char 
*learn; + char *train; }; static error_t parse_opt(int key, char *arg, struct argp_state *state) { @@ -24,6 +26,7 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { case 's': arguments->symbols = arg; break; case 't': arguments->text = arg; break; case 'n': arguments->learn = arg; break; + case 'a': arguments->train = arg; break; case ARGP_KEY_ARG: return 0; default: return ARGP_ERR_UNKNOWN; } @@ -39,18 +42,68 @@ static struct argp argp = { options, parse_opt, args_doc, doc, 0, 0, 0 }; */ /* Check if text is not a sentence */ -void ensure_single_word(char *text) { +void ensure_single_word(char *text) +{ int i = 0; - int s_length = strlen(text); + size_t s_length = strlen(text); while(i < s_length) { - if(text[i++] == ' ') { + if(text[i++] == ' ') + { printf("varnamc : Expected a single word."); exit(1); } } } + +/** + * splits str on delim and dynamically allocates an array of pointers. + * + * On error -1 is returned, check errno + * On success size of array is returned, which may be 0 on an empty string + * or 1 if no delim was found. + * + * You could rewrite this to return the char ** array instead and upon NULL + * know it's an allocation problem but I did the triple array here. Note that + * upon the hitting two delim's in a row "foo,,bar" the array would be: + * { "foo", NULL, "bar" } + * + * You need to define the semantics of a trailing delim Like "foo," is that a + * 2 count array or an array of one? I choose the two count with the second entry + * set to NULL since it's valueless. + * Modifies str so make a copy if this is a problem + */ +int split( char * str, char delim, char ***array ) { + char *p; + char **res; + int count=0; + int k=0; + + p = str; + /* Count occurance of delim in string */ + while( (p = strchr(p, delim)) != NULL ) { + *p = 0; /* Null terminate the deliminator. 
*/ + p++; /* Skip past our new null */ + count++; + } + + /* allocate dynamic array */ + res = calloc( 1, count * sizeof(char *)); + if( !res ) return -1; + + p = str; + for(k = 0; k < count + 1; k++){ + if( *p ) res[k] = p; /* Copy start of string */ + p = strchr(p, 0 ); /* Look for next null */ + p++; /* Start of next string */ + } + + *array = res; + + return count + 1; +} + /** * ----- * End Helper Functions @@ -69,6 +122,9 @@ print_transliteration_output(const char *pattern, varray *words) } } +/** + * Transliterate a word + */ int transliterate(varnam *handle, char *text) { int rc; @@ -90,24 +146,64 @@ int transliterate(varnam *handle, char *text) /** * Learn a word */ -int learn(varnam *handle, char *text) +int learn(varnam *handle, char *word) { - ensure_single_word(text); + int rc; + ensure_single_word(word); - int rc = varnam_learn (handle, text); + rc = varnam_learn (handle, word); if (rc != VARNAM_SUCCESS) { printf("%s", varnam_get_last_error(handle)); varnam_destroy(handle); exit(1); } - printf("Learned %s", text); + printf("Learned %s", word); + exit(0); +} + +/** + * Train a word + */ +const char* perform_training(varnam *handle, char *pattern, char *word) +{ + int rc; + + ensure_single_word(pattern); + ensure_single_word(word); + + rc = varnam_train(handle, pattern, word); + if (rc != VARNAM_SUCCESS) + { + const char *error_message = varnam_get_last_error(handle); + return error_message; + } + return NULL; +} + +int train(varnam *handle, char *pattern, char *word) +{ + const char* error; + + ensure_single_word(word); + + error = perform_training(handle, pattern, word); + if (error != NULL) + { + printf("%s", error); + varnam_destroy(handle); + exit(1); + } + printf("Success. 
%s will resolve to %s", pattern, word); exit(0); } int main(int argc, char *argv[]) { struct arguments arguments; + int rc; + varnam *handle; + char *msg; arguments.symbols = NULL; arguments.text = NULL; @@ -115,10 +211,6 @@ int main(int argc, char *argv[]) argp_parse(&argp, argc, argv, 0, 0, &arguments); - int rc; - varnam *handle; - char *msg; - if (arguments.symbols == NULL) { printf("varnamc : Can't load symbols file. Use --symbols option to specify the symbols file"); return 0; @@ -140,6 +232,18 @@ int main(int argc, char *argv[]) } else if (arguments.learn != NULL) { learn(handle, arguments.learn); + } else if (arguments.train != NULL) + { + char **tokens; + int count = split(arguments.train, '=', &tokens); + + if (count == 2) + { + train(handle, tokens[0], tokens[1]); + } else + { + printf("varnamc : Incorrect arguments"); + } } /* 0 means program executed successfully */ From b8d3bdce808d40e1665c2de20c6b780918357d1c Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 01:48:22 +0530 Subject: [PATCH 06/17] Set default value for all args as NULL --- varnamc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/varnamc.c b/varnamc.c index 7521855..88d358f 100644 --- a/varnamc.c +++ b/varnamc.c @@ -200,15 +200,11 @@ int train(varnam *handle, char *pattern, char *word) int main(int argc, char *argv[]) { - struct arguments arguments; + struct arguments arguments = {NULL}; int rc; varnam *handle; char *msg; - arguments.symbols = NULL; - arguments.text = NULL; - arguments.learn = NULL; - argp_parse(&argp, argc, argv, 0, 0, &arguments); if (arguments.symbols == NULL) { From 31704b49f248024afa6a858d06acb6d1d907f7fa Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 02:35:55 +0530 Subject: [PATCH 07/17] Add --version (-v) --- varnamc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/varnamc.c b/varnamc.c index 88d358f..6128662 100644 --- a/varnamc.c +++ b/varnamc.c @@ -10,6 +10,7 @@ 
static struct argp_option options[] = { { "text", 't', "TEXT", 0, "Transliterate the given text"}, { "learn", 'n', "TEXT", 0, "Learn the given text"}, { "train", 'a', "PATTERN=WORD", 0, "Train the given text"}, + { "version", 'v', "", OPTION_ARG_OPTIONAL, "Display version"}, { 0 } }; @@ -18,6 +19,7 @@ struct arguments { char *text; char *learn; char *train; + bool *version; }; static error_t parse_opt(int key, char *arg, struct argp_state *state) { @@ -27,8 +29,9 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { case 't': arguments->text = arg; break; case 'n': arguments->learn = arg; break; case 'a': arguments->train = arg; break; + case 'v': arguments->version = true; break; case ARGP_KEY_ARG: return 0; - default: return ARGP_ERR_UNKNOWN; + default: ARGP_ERR_UNKNOWN; } return 0; } @@ -207,9 +210,14 @@ int main(int argc, char *argv[]) argp_parse(&argp, argc, argv, 0, 0, &arguments); - if (arguments.symbols == NULL) { + if (arguments.version) + { + printf("libvarnam version %s", varnam_version()); + exit(0); + } else if (arguments.symbols == NULL) + { printf("varnamc : Can't load symbols file. Use --symbols option to specify the symbols file"); - return 0; + exit(0); } /* Initialization */ @@ -219,7 +227,7 @@ int main(int argc, char *argv[]) { printf("Initialization failed. Reason - %s", msg); free (msg); - return 1; + exit(1); } if (arguments.text != NULL) From 262e9e83e53c3a4f279932cb10c55d5d58b66fb9 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 02:48:18 +0530 Subject: [PATCH 08/17] Fix naming of transliterate. 
--transliterate or -t --- varnamc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/varnamc.c b/varnamc.c index 6128662..30a7f75 100644 --- a/varnamc.c +++ b/varnamc.c @@ -7,7 +7,7 @@ static char doc[] = "an Indic language transliteration library"; static char args_doc[] = ""; static struct argp_option options[] = { { "symbols", 's', "VALUE", 0, "Sets the symbols file"}, - { "text", 't', "TEXT", 0, "Transliterate the given text"}, + { "transliterate", 't', "TEXT", 0, "Transliterate the given text"}, { "learn", 'n', "TEXT", 0, "Learn the given text"}, { "train", 'a', "PATTERN=WORD", 0, "Train the given text"}, { "version", 'v', "", OPTION_ARG_OPTIONAL, "Display version"}, @@ -16,17 +16,17 @@ static struct argp_option options[] = { struct arguments { char *symbols; - char *text; + char *transliterate; char *learn; char *train; - bool *version; + bool version; }; static error_t parse_opt(int key, char *arg, struct argp_state *state) { struct arguments *arguments = state->input; switch (key) { case 's': arguments->symbols = arg; break; - case 't': arguments->text = arg; break; + case 't': arguments->transliterate = arg; break; case 'n': arguments->learn = arg; break; case 'a': arguments->train = arg; break; case 'v': arguments->version = true; break; @@ -230,9 +230,9 @@ int main(int argc, char *argv[]) exit(1); } - if (arguments.text != NULL) + if (arguments.transliterate != NULL) { - transliterate(handle, arguments.text); + transliterate(handle, arguments.transliterate); } else if (arguments.learn != NULL) { learn(handle, arguments.learn); From 110c97f879ae607ffc946365a76e139f13b03869 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 03:02:00 +0530 Subject: [PATCH 09/17] Add --reverse-transliterate (-r) --- varnamc.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/varnamc.c b/varnamc.c index 30a7f75..64589a7 100644 --- a/varnamc.c +++ b/varnamc.c @@ -8,6 +8,7 @@ 
static char args_doc[] = ""; static struct argp_option options[] = { { "symbols", 's', "VALUE", 0, "Sets the symbols file"}, { "transliterate", 't', "TEXT", 0, "Transliterate the given text"}, + { "reverse-transliterate", 'r', "TEXT", 0, "Reverse transliterate the given text"}, { "learn", 'n', "TEXT", 0, "Learn the given text"}, { "train", 'a', "PATTERN=WORD", 0, "Train the given text"}, { "version", 'v', "", OPTION_ARG_OPTIONAL, "Display version"}, @@ -17,6 +18,7 @@ static struct argp_option options[] = { struct arguments { char *symbols; char *transliterate; + char *reverse_transliterate; char *learn; char *train; bool version; @@ -27,6 +29,7 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { switch (key) { case 's': arguments->symbols = arg; break; case 't': arguments->transliterate = arg; break; + case 'r': arguments->reverse_transliterate = arg; break; case 'n': arguments->learn = arg; break; case 'a': arguments->train = arg; break; case 'v': arguments->version = true; break; @@ -113,8 +116,7 @@ int split( char * str, char delim, char ***array ) { * ----- */ -static void -print_transliteration_output(const char *pattern, varray *words) +void print_transliteration_output(varray *words) { int i; vword *word; @@ -128,7 +130,7 @@ print_transliteration_output(const char *pattern, varray *words) /** * Transliterate a word */ -int transliterate(varnam *handle, char *text) +void transliterate(varnam *handle, char *text) { int rc; varray *words; @@ -142,14 +144,35 @@ int transliterate(varnam *handle, char *text) varnam_destroy(handle); exit(1); } - print_transliteration_output ("malayalam", words); + print_transliteration_output (words); + exit(0); +} + +/** + * Reverse transliterate a word + */ +void reverse_transliterate(varnam *handle, char *text) +{ + int rc; + char *output; + + ensure_single_word(text); + + rc = varnam_reverse_transliterate (handle, text, &output); + if (rc != VARNAM_SUCCESS) + { + printf("%s", 
varnam_get_last_error(handle)); + varnam_destroy(handle); + exit(1); + } + printf("%s", output); exit(0); } /** * Learn a word */ -int learn(varnam *handle, char *word) +void learn(varnam *handle, char *word) { int rc; ensure_single_word(word); @@ -233,6 +256,9 @@ int main(int argc, char *argv[]) if (arguments.transliterate != NULL) { transliterate(handle, arguments.transliterate); + } else if (arguments.reverse_transliterate != NULL) + { + reverse_transliterate(handle, arguments.reverse_transliterate); } else if (arguments.learn != NULL) { learn(handle, arguments.learn); From 5ebd0fcfc6242e911a12dc14fa8dd394dc184790 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 03:05:45 +0530 Subject: [PATCH 10/17] Prettifying some code --- varnamc.c | 55 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/varnamc.c b/varnamc.c index 64589a7..492ae41 100644 --- a/varnamc.c +++ b/varnamc.c @@ -6,13 +6,13 @@ static char doc[] = "an Indic language transliteration library"; static char args_doc[] = ""; static struct argp_option options[] = { - { "symbols", 's', "VALUE", 0, "Sets the symbols file"}, - { "transliterate", 't', "TEXT", 0, "Transliterate the given text"}, - { "reverse-transliterate", 'r', "TEXT", 0, "Reverse transliterate the given text"}, - { "learn", 'n', "TEXT", 0, "Learn the given text"}, - { "train", 'a', "PATTERN=WORD", 0, "Train the given text"}, - { "version", 'v', "", OPTION_ARG_OPTIONAL, "Display version"}, - { 0 } + {"symbols", 's', "VALUE", 0, "Sets the symbols file"}, + {"transliterate", 't', "TEXT", 0, "Transliterate the given text"}, + {"reverse-transliterate", 'r', "TEXT", 0, "Reverse transliterate the given text"}, + {"learn", 'n', "TEXT", 0, "Learn the given text"}, + {"train", 'a', "PATTERN=WORD", 0, "Train the given text"}, + {"version", 'v', "", OPTION_ARG_OPTIONAL, "Display version"}, + {0} }; struct arguments { @@ -27,15 +27,29 @@ struct arguments { static error_t 
parse_opt(int key, char *arg, struct argp_state *state) { struct arguments *arguments = state->input; switch (key) { - case 's': arguments->symbols = arg; break; - case 't': arguments->transliterate = arg; break; - case 'r': arguments->reverse_transliterate = arg; break; - case 'n': arguments->learn = arg; break; - case 'a': arguments->train = arg; break; - case 'v': arguments->version = true; break; - case ARGP_KEY_ARG: return 0; - default: ARGP_ERR_UNKNOWN; - } + case 's': + arguments->symbols = arg; + break; + case 't': + arguments->transliterate = arg; + break; + case 'r': + arguments->reverse_transliterate = arg; + break; + case 'n': + arguments->learn = arg; + break; + case 'a': + arguments->train = arg; + break; + case 'v': + arguments->version = true; + break; + case ARGP_KEY_ARG: + return 0; + default: + ARGP_ERR_UNKNOWN; + } return 0; } @@ -80,7 +94,8 @@ void ensure_single_word(char *text) * set to NULL since it's valueless. * Modifies str so make a copy if this is a problem */ -int split( char * str, char delim, char ***array ) { +int split( char * str, char delim, char ***array ) +{ char *p; char **res; int count=0; @@ -120,10 +135,10 @@ void print_transliteration_output(varray *words) { int i; vword *word; - for (i = 0; i < varray_length (words); i++) + for (i = 0; i < varray_length(words); i++) { - word = varray_get (words, i); - printf (" %s. Confidence %d\n", word->text, word->confidence); + word = varray_get(words, i); + printf ("%s. Confidence %d\n", word->text, word->confidence); } } From f775edd640fe4b23b891fa80a3ecbce4543cecab Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 03:16:10 +0530 Subject: [PATCH 11/17] Add an --info (-i) flag to get confidence value in transliteration output. 
By default it should be just text --- varnamc.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/varnamc.c b/varnamc.c index 492ae41..b416823 100644 --- a/varnamc.c +++ b/varnamc.c @@ -8,6 +8,7 @@ static char args_doc[] = ""; static struct argp_option options[] = { {"symbols", 's', "VALUE", 0, "Sets the symbols file"}, {"transliterate", 't', "TEXT", 0, "Transliterate the given text"}, + {"info", 'i', "", OPTION_ARG_OPTIONAL, "Detailed transliteration output. Use with -t"}, {"reverse-transliterate", 'r', "TEXT", 0, "Reverse transliterate the given text"}, {"learn", 'n', "TEXT", 0, "Learn the given text"}, {"train", 'a', "PATTERN=WORD", 0, "Train the given text"}, @@ -18,6 +19,7 @@ static struct argp_option options[] = { struct arguments { char *symbols; char *transliterate; + bool info; char *reverse_transliterate; char *learn; char *train; @@ -33,6 +35,9 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { case 't': arguments->transliterate = arg; break; + case 'i': + arguments->info = true; + break; case 'r': arguments->reverse_transliterate = arg; break; @@ -131,21 +136,28 @@ int split( char * str, char delim, char ***array ) * ----- */ -void print_transliteration_output(varray *words) +void print_transliteration_output(varray *words, bool info) { int i; vword *word; for (i = 0; i < varray_length(words); i++) { + if (i != 0) + printf("\n"); + word = varray_get(words, i); - printf ("%s. Confidence %d\n", word->text, word->confidence); + if (info) { + printf ("%s. 
Confidence %d", word->text, word->confidence); + } else { + printf ("%s", word->text); + } } } /** * Transliterate a word */ -void transliterate(varnam *handle, char *text) +void transliterate(varnam *handle, char *text, bool info) { int rc; varray *words; @@ -159,7 +171,7 @@ void transliterate(varnam *handle, char *text) varnam_destroy(handle); exit(1); } - print_transliteration_output (words); + print_transliteration_output (words, info); exit(0); } @@ -270,7 +282,7 @@ int main(int argc, char *argv[]) if (arguments.transliterate != NULL) { - transliterate(handle, arguments.transliterate); + transliterate(handle, arguments.transliterate, arguments.info); } else if (arguments.reverse_transliterate != NULL) { reverse_transliterate(handle, arguments.reverse_transliterate); From 5d79ed259d4eb515e93b898c66cc7262f315e5e8 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 19:27:03 +0530 Subject: [PATCH 12/17] Add --learn-from (-f) --- varnamc.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/varnamc.c b/varnamc.c index b416823..749a00d 100644 --- a/varnamc.c +++ b/varnamc.c @@ -8,9 +8,10 @@ static char args_doc[] = ""; static struct argp_option options[] = { {"symbols", 's', "VALUE", 0, "Sets the symbols file"}, {"transliterate", 't', "TEXT", 0, "Transliterate the given text"}, - {"info", 'i', "", OPTION_ARG_OPTIONAL, "Detailed transliteration output. Use with -t"}, + {"info", 'i', "", OPTION_ARG_OPTIONAL, "Detailed transliteration output. 
Use with --transliterate"}, {"reverse-transliterate", 'r', "TEXT", 0, "Reverse transliterate the given text"}, {"learn", 'n', "TEXT", 0, "Learn the given text"}, + {"learn-from", 'f', "FILE", 0, "Reads from the specified file"}, {"train", 'a', "PATTERN=WORD", 0, "Train the given text"}, {"version", 'v', "", OPTION_ARG_OPTIONAL, "Display version"}, {0} @@ -22,6 +23,7 @@ struct arguments { bool info; char *reverse_transliterate; char *learn; + char *learn_from; char *train; bool version; }; @@ -44,6 +46,9 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { case 'n': arguments->learn = arg; break; + case 'f': + arguments->learn_from = arg; + break; case 'a': arguments->train = arg; break; @@ -215,6 +220,40 @@ void learn(varnam *handle, char *word) exit(0); } +int learn_counter = 0; +int learn_passed_counter = 0; +int learn_failed_counter = 0; +void learn_callback(varnam *handle, const char *word, int status, void *data) +{ + if (status == VARNAM_SUCCESS) + { + learn_passed_counter++; + } else + { + printf("Failed to learn %s : %s\n", word, varnam_get_last_error(handle)); + learn_failed_counter++; + } + learn_counter++; +} + +/** + * Learn words from a file + */ +void learn_from(varnam *handle, char *file_path) +{ + int rc; + + rc = varnam_learn_from_file (handle, file_path, NULL, learn_callback, NULL); + if (rc != VARNAM_SUCCESS) + { + printf("%s", varnam_get_last_error(handle)); + varnam_destroy(handle); + exit(1); + } + printf("Processed %d word(s). %d word(s) passed. 
%d word(s) failed.", learn_counter, learn_passed_counter, learn_failed_counter); + exit(0); +} + /** * Train a word */ @@ -289,6 +328,9 @@ int main(int argc, char *argv[]) } else if (arguments.learn != NULL) { learn(handle, arguments.learn); + } else if (arguments.learn_from != NULL) + { + learn_from(handle, arguments.learn_from); } else if (arguments.train != NULL) { char **tokens; From 6c0ecd2e81e5870b0b6cbf891cababbdf8b61051 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 20:01:35 +0530 Subject: [PATCH 13/17] Rename --info (-d) to --details (-d) --- varnamc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/varnamc.c b/varnamc.c index 749a00d..c3515de 100644 --- a/varnamc.c +++ b/varnamc.c @@ -8,7 +8,7 @@ static char args_doc[] = ""; static struct argp_option options[] = { {"symbols", 's', "VALUE", 0, "Sets the symbols file"}, {"transliterate", 't', "TEXT", 0, "Transliterate the given text"}, - {"info", 'i', "", OPTION_ARG_OPTIONAL, "Detailed transliteration output. Use with --transliterate"}, + {"details", 'd', "", OPTION_ARG_OPTIONAL, "Detailed transliteration output. 
Use with --transliterate"}, {"reverse-transliterate", 'r', "TEXT", 0, "Reverse transliterate the given text"}, {"learn", 'n', "TEXT", 0, "Learn the given text"}, {"learn-from", 'f', "FILE", 0, "Reads from the specified file"}, @@ -20,7 +20,7 @@ static struct argp_option options[] = { struct arguments { char *symbols; char *transliterate; - bool info; + bool details; char *reverse_transliterate; char *learn; char *learn_from; @@ -37,8 +37,8 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { case 't': arguments->transliterate = arg; break; - case 'i': - arguments->info = true; + case 'd': + arguments->details = true; break; case 'r': arguments->reverse_transliterate = arg; @@ -141,7 +141,7 @@ int split( char * str, char delim, char ***array ) * ----- */ -void print_transliteration_output(varray *words, bool info) +void print_transliteration_output(varray *words, bool details) { int i; vword *word; @@ -151,7 +151,7 @@ void print_transliteration_output(varray *words, bool info) printf("\n"); word = varray_get(words, i); - if (info) { + if (details) { printf ("%s. 
Confidence %d", word->text, word->confidence); } else { printf ("%s", word->text); @@ -321,7 +321,7 @@ int main(int argc, char *argv[]) if (arguments.transliterate != NULL) { - transliterate(handle, arguments.transliterate, arguments.info); + transliterate(handle, arguments.transliterate, arguments.details); } else if (arguments.reverse_transliterate != NULL) { reverse_transliterate(handle, arguments.reverse_transliterate); From f3b21bf2ec53b4545c7b66aea71ac5fe764081d2 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Thu, 17 Sep 2020 20:17:23 +0530 Subject: [PATCH 14/17] Add --import-learnings-from (-i) --- varnamc.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/varnamc.c b/varnamc.c index c3515de..7054ded 100644 --- a/varnamc.c +++ b/varnamc.c @@ -13,6 +13,7 @@ static struct argp_option options[] = { {"learn", 'n', "TEXT", 0, "Learn the given text"}, {"learn-from", 'f', "FILE", 0, "Reads from the specified file"}, {"train", 'a', "PATTERN=WORD", 0, "Train the given text"}, + {"import-learnings-from", 'i', "FILE", 0, "Import learned data from the specified file"}, {"version", 'v', "", OPTION_ARG_OPTIONAL, "Display version"}, {0} }; @@ -25,6 +26,7 @@ struct arguments { char *learn; char *learn_from; char *train; + char *import_learnings_from; bool version; }; @@ -52,6 +54,9 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { case 'a': arguments->train = arg; break; + case 'i': + arguments->import_learnings_from = arg; + break; case 'v': arguments->version = true; break; @@ -290,6 +295,27 @@ int train(varnam *handle, char *pattern, char *word) exit(0); } +/** + * Import learnings from a file + */ +void import_learnings_from(varnam *handle, char *file_path) +{ + int rc; + + printf("Importing: %s\n", file_path); + + rc = varnam_import_learnings_from_file(handle, file_path); + if (rc != VARNAM_SUCCESS) + { + const char *error_message = varnam_get_last_error(handle); + printf("%s", error_message); + exit(1); + } + 
+ printf("Done"); + exit(0); +} + int main(int argc, char *argv[]) { struct arguments arguments = {NULL}; @@ -343,6 +369,9 @@ int main(int argc, char *argv[]) { printf("varnamc : Incorrect arguments"); } + } else if (arguments.import_learnings_from != NULL) + { + import_learnings_from(handle, arguments.import_learnings_from); } /* 0 means program executed successfully */ From 1a731e2e1f50bcb4819b537b24e72aab962ddcfd Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Mon, 19 Oct 2020 20:18:40 +0530 Subject: [PATCH 15/17] Add --export-full --- varnamc.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/varnamc.c b/varnamc.c index 7054ded..e5a3b14 100644 --- a/varnamc.c +++ b/varnamc.c @@ -14,6 +14,7 @@ static struct argp_option options[] = { {"learn-from", 'f', "FILE", 0, "Reads from the specified file"}, {"train", 'a', "PATTERN=WORD", 0, "Train the given text"}, {"import-learnings-from", 'i', "FILE", 0, "Import learned data from the specified file"}, + {"export-full", 'e', "DIR", 0, "Export words and patterns to the specified directory"}, {"version", 'v', "", OPTION_ARG_OPTIONAL, "Display version"}, {0} }; @@ -27,6 +28,7 @@ struct arguments { char *learn_from; char *train; char *import_learnings_from; + char *export_full; bool version; }; @@ -57,6 +59,9 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) { case 'i': arguments->import_learnings_from = arg; break; + case 'e': + arguments->export_full = arg; + break; case 'v': arguments->version = true; break; @@ -316,6 +321,38 @@ void import_learnings_from(varnam *handle, char *file_path) exit(0); } +void export_callback(int total_words, int total_processed, const char* current_word) +{ + float progress = total_words > 0 ? (float) total_processed / total_words * 100 : 0; /* avoid dividing by zero when there are no words to export */ + printf("\rExporting %d%%", (int) progress); +} + +/** + * Export words & patterns to a directory + */ +void export_full(varnam *handle, char *dir_path) +{ + int rc; + + if (!is_directory(dir_path)) { +
printf("varnamc : Output directory not found"); + exit(1); + } + + printf("Exporting words from '%s' to '%s'\n", varnam_get_suggestions_file(handle), dir_path); + + rc = varnam_export_words(handle, 30000, dir_path, VARNAM_EXPORT_FULL, export_callback); + if (rc != VARNAM_SUCCESS) + { + const char *error_message = varnam_get_last_error(handle); + printf("Export failed. %s\n", error_message); + exit(1); + } + + printf("\nExported words to %s", dir_path); + exit(0); +} + int main(int argc, char *argv[]) { struct arguments arguments = {NULL}; @@ -372,6 +409,9 @@ int main(int argc, char *argv[]) } else if (arguments.import_learnings_from != NULL) { import_learnings_from(handle, arguments.import_learnings_from); + } else if (arguments.export_full != NULL) + { + export_full(handle, arguments.export_full); } /* 0 means program executed successfully */ From 578abccd22a0a6bcf0bf956a877ce1a2018b1f7d Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Mon, 19 Oct 2020 20:27:09 +0530 Subject: [PATCH 16/17] Install varnamc.rb to /bin too --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ce4bf90..8a95784 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,6 +190,7 @@ INSTALL ( FILES ${CMAKE_BINARY_DIR}/varnamstatic.pc DESTINATION ${VARNAM_LIB_INS INSTALL ( FILES varnamruby.rb DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) INSTALL_PROGRAMS(/bin FILES varnamc) INSTALL ( FILES varnamc.1 DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man/man1) +INSTALL_PROGRAMS(/bin FILES varnamc.rb) # uninstall target configure_file( From 01d97f5cc79901bab6006b869d38bbf9e0f56c72 Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Fri, 6 Nov 2020 16:10:45 +0530 Subject: [PATCH 17/17] Disable building varnamc in MSVC --- CMakeLists.txt | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a95784..1ad284d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,7 +209,10 @@ else() 
target_link_libraries(${VARNAM_LIBRARY_NAME} pthread dl ${SQLITE3_LIBRARIES}) ENDIF() -FILE(GLOB_RECURSE SOURCES varnamc.c) -ADD_EXECUTABLE(${VARNAM_BINARY_NAME} ${SOURCES}) - -target_link_libraries(${VARNAM_BINARY_NAME} ${VARNAM_LIBRARY_NAME}) \ No newline at end of file +# argp is not available by default on Windows, so varnamc is only built for non-MSVC toolchains +# TODO make varnamc work on windows too +IF(NOT MSVC) + FILE(GLOB_RECURSE SOURCES varnamc.c) + ADD_EXECUTABLE(${VARNAM_BINARY_NAME} ${SOURCES}) + target_link_libraries(${VARNAM_BINARY_NAME} ${VARNAM_LIBRARY_NAME}) +ENDIF()