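/* Regression test for the ngram_model_set API: selecting component models,
 * interpolating them, adding and removing models, remapping word IDs, and
 * reading a model set from an lmctl file. */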
#include <ngram_model.h>
#include <logmath.h>
#include <strfuncs.h>

#include "test_macros.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

int
main(int argc, char *argv[])
{
    logmath_t *lmath;
    ngram_model_t *lms[3];
    ngram_model_t *lmset;
    const char *names[] = { "100", "102" };
    const char *words[] = {
        "<UNK>",
        "ROBOMAN",
        "libio",
        "sphinxtrain",
        "bigbird",
        "quuxfuzz"
    };
    const int32 n_words = sizeof(words) / sizeof(words[0]);
    float32 weights[] = { 0.6, 0.4 };

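    /* A log base this close to 1 keeps quantization error small, so scores
     * can be compared against the log10 values stored in the LM files. */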
    lmath = logmath_init(1.0001, 0, 0);

    lms[0] = ngram_model_read(NULL, LMDIR "/100.lm.dmp", NGRAM_BIN, lmath);
    lms[1] = ngram_model_read(NULL, LMDIR "/102.lm.dmp", NGRAM_BIN, lmath);

    lmset = ngram_model_set_init(NULL, lms, (char **)names, NULL, 2);
    TEST_ASSERT(lmset);
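    /* Selecting a component by name returns that model; subsequent scores
     * come from whichever model is currently selected. */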
    TEST_EQUAL(ngram_model_set_select(lmset, "102"), lms[1]);
    TEST_EQUAL(ngram_model_set_select(lmset, "100"), lms[0]);
    TEST_EQUAL(ngram_score(lmset, "sphinxtrain", NULL),
               logmath_log10_to_log(lmath, -2.7884));
    TEST_EQUAL(ngram_score(lmset, "huggins", "david", NULL),
               logmath_log10_to_log(lmath, -0.0361));
    TEST_EQUAL_LOG(ngram_score(lmset, "daines", "huggins", "david", NULL),
                   logmath_log10_to_log(lmath, -0.4105));

    TEST_EQUAL(ngram_model_set_select(lmset, "102"), lms[1]);
    TEST_EQUAL(ngram_score(lmset, "sphinxtrain", NULL),
               logmath_log10_to_log(lmath, -2.8192));
    TEST_EQUAL(ngram_score(lmset, "huggins", "david", NULL),
               logmath_log10_to_log(lmath, -0.1597));
    TEST_EQUAL_LOG(ngram_score(lmset, "daines", "huggins", "david", NULL),
                   logmath_log10_to_log(lmath, -0.0512));

    /* Test interpolation with default weights. */
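    /* With two component models and no explicit weights, each model gets
     * weight 0.5, so the expected score is the plain average of the two. */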
    TEST_ASSERT(ngram_model_set_interp(lmset, NULL, NULL));
    TEST_EQUAL_LOG(ngram_score(lmset, "sphinxtrain", NULL),
                   logmath_log(lmath,
                               0.5 * pow(10, -2.7884)
                               + 0.5 * pow(10, -2.8192)));

    /* Test interpolation with set weights. */
    TEST_ASSERT(ngram_model_set_interp(lmset, names, weights));
    TEST_EQUAL_LOG(ngram_score(lmset, "sphinxtrain", NULL),
                   logmath_log(lmath,
                               0.6 * pow(10, -2.7884)
                               + 0.4 * pow(10, -2.8192)));

    /* Test switching back to selected mode. */
    TEST_EQUAL(ngram_model_set_select(lmset, "102"), lms[1]);
    TEST_EQUAL(ngram_score(lmset, "sphinxtrain", NULL),
               logmath_log10_to_log(lmath, -2.8192));
    TEST_EQUAL(ngram_score(lmset, "huggins", "david", NULL),
               logmath_log10_to_log(lmath, -0.1597));
    TEST_EQUAL_LOG(ngram_score(lmset, "daines", "huggins", "david", NULL),
                   logmath_log10_to_log(lmath, -0.0512));

    /* Test interpolation with previously set weights. */
    TEST_ASSERT(ngram_model_set_interp(lmset, NULL, NULL));
    TEST_EQUAL_LOG(ngram_score(lmset, "sphinxtrain", NULL),
                   logmath_log(lmath,
                               0.6 * pow(10, -2.7884)
                               + 0.4 * pow(10, -2.8192)));

    /* Test interpolation with closed-vocabulary models and OOVs. */
    lms[2] = ngram_model_read(NULL, LMDIR "/turtle.lm", NGRAM_ARPA, lmath);
    TEST_ASSERT(ngram_model_set_add(lmset, lms[2], "turtle", 1.0, FALSE));
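    /* Adding "turtle" rescales the existing weights by 2/3, and turtle.lm,
     * being closed-vocabulary, contributes nothing for the OOV "sphinxtrain". */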
    TEST_EQUAL_LOG(ngram_score(lmset, "sphinxtrain", NULL),
                   logmath_log(lmath,
                               0.6 * (2.0 / 3.0) * pow(10, -2.7884)
                               + 0.4 * (2.0 / 3.0) * pow(10, -2.8192)));
    ngram_model_free(lmset);

    /* Test adding and removing language models with preserved
     * word ID mappings. */
    lms[0] = ngram_model_read(NULL, LMDIR "/100.lm.dmp", NGRAM_BIN, lmath);
    lms[1] = ngram_model_read(NULL, LMDIR "/102.lm.dmp", NGRAM_BIN, lmath);
    lms[2] = ngram_model_read(NULL, LMDIR "/turtle.lm", NGRAM_ARPA, lmath);
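    /* Start with a set containing only the first model ("100"). */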
    lmset = ngram_model_set_init(NULL, lms, (char **)names, NULL, 1);
    {
        int32 wid;
        wid = ngram_wid(lmset, "sphinxtrain");
        TEST_ASSERT(ngram_model_set_add(lmset, lms[1], "102", 1.0, TRUE));
        /* Verify that it is the same. */
        TEST_EQUAL(wid, ngram_wid(lmset, "sphinxtrain"));
        /* Now add another model and verify that its words
         * don't actually get added. */
        TEST_ASSERT(ngram_model_set_add(lmset, lms[2], "turtle", 1.0, TRUE));
        TEST_EQUAL(wid, ngram_wid(lmset, "sphinxtrain"));
        TEST_EQUAL(ngram_unknown_wid(lmset), ngram_wid(lmset, "FORWARD"));
        /* Remove a language model, make sure this doesn't break horribly. */
        TEST_EQUAL(lms[1], ngram_model_set_remove(lmset, "102", TRUE));
        ngram_model_free(lms[1]);
        TEST_EQUAL(wid, ngram_wid(lmset, "sphinxtrain"));
        /* Now enable remapping of word IDs and verify that it works. */
        TEST_EQUAL(lms[2], ngram_model_set_remove(lmset, "turtle", TRUE));
        TEST_ASSERT(ngram_model_set_add(lmset, lms[2], "turtle", 1.0, FALSE));
        printf("FORWARD = %d\n", ngram_wid(lmset, "FORWARD"));
    }

    ngram_model_free(lmset);

    /* Now test lmctl files. */
    lmset = ngram_model_set_read(NULL, LMDIR "/100.lmctl", lmath);
    TEST_ASSERT(lmset);
    /* Test iterators. */
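    /* The lmctl file defines three component models; walk them with the
     * set iterator and print their names. */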
    {
        ngram_model_set_iter_t *itor;
        ngram_model_t *lm;
        char const *lmname;

        itor = ngram_model_set_iter(lmset);
        TEST_ASSERT(itor);
        lm = ngram_model_set_iter_model(itor, &lmname);
        printf("1: %s\n", lmname);
        itor = ngram_model_set_iter_next(itor);
        lm = ngram_model_set_iter_model(itor, &lmname);
        printf("2: %s\n", lmname);
        itor = ngram_model_set_iter_next(itor);
        lm = ngram_model_set_iter_model(itor, &lmname);
        printf("3: %s\n", lmname);
        itor = ngram_model_set_iter_next(itor);
        TEST_EQUAL(itor, NULL);
    }

    TEST_EQUAL(ngram_score(lmset, "sphinxtrain", NULL),
               logmath_log10_to_log(lmath, -2.7884));

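    /* Interpolate with default (uniform) weights of 1/3 per model; only the
     * two models that contain "sphinxtrain" contribute to its score. */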
    TEST_ASSERT(ngram_model_set_interp(lmset, NULL, NULL));
    TEST_EQUAL_LOG(ngram_score(lmset, "sphinxtrain", NULL),
                   logmath_log(lmath,
                               (1.0 / 3.0) * pow(10, -2.7884)
                               + (1.0 / 3.0) * pow(10, -2.8192)));

    ngram_model_set_select(lmset, "102");
    TEST_EQUAL(ngram_score(lmset, "sphinxtrain", NULL),
               logmath_log10_to_log(lmath, -2.8192));
    TEST_EQUAL(ngram_score(lmset, "huggins", "david", NULL),
               logmath_log10_to_log(lmath, -0.1597));
    TEST_EQUAL_LOG(ngram_score(lmset, "daines", "huggins", "david", NULL),
                   logmath_log10_to_log(lmath, -0.0512));

    ngram_model_set_select(lmset, "100");
    TEST_EQUAL(ngram_score(lmset, "sphinxtrain", NULL),
               logmath_log10_to_log(lmath, -2.7884));
    TEST_EQUAL(ngram_score(lmset, "huggins", "david", NULL),
               logmath_log10_to_log(lmath, -0.0361));
    TEST_EQUAL_LOG(ngram_score(lmset, "daines", "huggins", "david", NULL),
                   logmath_log10_to_log(lmath, -0.4105));

    /* Test class probabilities. */
    ngram_model_set_select(lmset, "100");
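    /* "word:class" entries score as the LM score of the class plus the log
     * of the word's in-class probability. */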
    TEST_EQUAL_LOG(ngram_score(lmset, "scylla:scylla", NULL),
                   logmath_log10_to_log(lmath, -2.7884) + logmath_log(lmath, 0.4));
    TEST_EQUAL_LOG(ngram_score(lmset, "scooby:scylla", NULL),
                   logmath_log10_to_log(lmath, -2.7884) + logmath_log(lmath, 0.1));
    TEST_EQUAL_LOG(ngram_score(lmset, "apparently", "karybdis:scylla", NULL),
                   logmath_log10_to_log(lmath, -0.5172));

    /* Test word ID mapping. */
    ngram_model_set_select(lmset, "turtle");
    TEST_EQUAL(ngram_wid(lmset, "ROBOMAN"),
               ngram_wid(lmset, ngram_word(lmset, ngram_wid(lmset, "ROBOMAN"))));
    TEST_EQUAL(ngram_wid(lmset, "bigbird"),
               ngram_wid(lmset, ngram_word(lmset, ngram_wid(lmset, "bigbird"))));
    TEST_EQUAL(ngram_wid(lmset, "quuxfuzz"), ngram_unknown_wid(lmset));
    TEST_EQUAL(ngram_score(lmset, "quuxfuzz", NULL), ngram_zero(lmset));
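    /* Install a new unified word list: "quuxfuzz" becomes entry 5 in words[]
     * but, being absent from every component model, still scores as zero. */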
    ngram_model_set_map_words(lmset, words, n_words);
    TEST_EQUAL(ngram_wid(lmset, "ROBOMAN"),
               ngram_wid(lmset, ngram_word(lmset, ngram_wid(lmset, "ROBOMAN"))));
    TEST_EQUAL(ngram_wid(lmset, "bigbird"),
               ngram_wid(lmset, ngram_word(lmset, ngram_wid(lmset, "bigbird"))));
    TEST_EQUAL(ngram_wid(lmset, "quuxfuzz"), 5);
    TEST_EQUAL(ngram_score(lmset, "quuxfuzz", NULL), ngram_zero(lmset));

    ngram_model_free(lmset);
    logmath_free(lmath);
    return 0;
}