174 lines
6.4 KiB
C
174 lines
6.4 KiB
C
/*************************************************************************/
|
|
/* */
|
|
/* Language Technologies Institute */
|
|
/* Carnegie Mellon University */
|
|
/* Copyright (c) 2001 */
|
|
/* All Rights Reserved. */
|
|
/* */
|
|
/* Permission is hereby granted, free of charge, to use and distribute */
|
|
/* this software and its documentation without restriction, including */
|
|
/* without limitation the rights to use, copy, modify, merge, publish, */
|
|
/* distribute, sublicense, and/or sell copies of this work, and to */
|
|
/* permit persons to whom this work is furnished to do so, subject to */
|
|
/* the following conditions: */
|
|
/* 1. The code must retain the above copyright notice, this list of */
|
|
/* conditions and the following disclaimer. */
|
|
/* 2. Any modifications must be clearly marked as such. */
|
|
/* 3. Original authors' names are not deleted. */
|
|
/* 4. The authors' names are not used to endorse or promote products */
|
|
/* derived from this software without specific prior written */
|
|
/* permission. */
|
|
/* */
|
|
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
|
|
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
|
|
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
|
|
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
|
|
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
|
|
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
|
|
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
|
|
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
|
|
/* THIS SOFTWARE. */
|
|
/* */
|
|
/*************************************************************************/
|
|
/* Author: Alan W Black (awb@cs.cmu.edu) */
|
|
/* Date: January 2001 */
|
|
/*************************************************************************/
|
|
/* Feature functions used by various cart trees etc */
|
|
/* These have been create as needed, and as some of the trees are */
|
|
/* from University of Edinburgh's Festival system their names and */
|
|
/* semantics follow them */
|
|
/*************************************************************************/
|
|
|
|
#include "cst_hrg.h"
|
|
#include "cst_phoneset.h"
|
|
#include "cst_regex.h"
|
|
#include "cst_ffeatures.h"
|
|
#include "us_ffeatures.h"
|
|
|
|
static const cst_val *gpos(const cst_item *word);
|
|
|
|
DEF_STATIC_CONST_VAL_STRING(val_string_numeric,"numeric");
|
|
DEF_STATIC_CONST_VAL_STRING(val_string_number,"number");
|
|
DEF_STATIC_CONST_VAL_STRING(val_string_month,"month");
|
|
DEF_STATIC_CONST_VAL_STRING(val_string_day,"day");
|
|
DEF_STATIC_CONST_VAL_STRING(val_string_other,"_other_");
|
|
DEF_STATIC_CONST_VAL_STRING(val_string_a,"a");
|
|
DEF_STATIC_CONST_VAL_STRING(val_string_flight,"flight");
|
|
DEF_STATIC_CONST_VAL_STRING(val_string_to,"to");
|
|
|
|
DEF_STATIC_CONST_VAL_STRING(val_string_content,"content");
|
|
|
|
static const cst_val *gpos(const cst_item *word)
|
|
{
|
|
/* Guess at part of speech (function/content) */
|
|
const char *w;
|
|
int s,t;
|
|
|
|
w = item_feat_string(word,"name");
|
|
|
|
for (s=0; us_gpos[s]; s++)
|
|
{
|
|
for (t=1; us_gpos[s][t]; t++)
|
|
if (cst_streq(w,val_string(us_gpos[s][t])))
|
|
return us_gpos[s][0];
|
|
}
|
|
|
|
return (cst_val *)&val_string_content;
|
|
}
|
|
|
|
static const cst_val *num_digits(const cst_item *token)
|
|
{
|
|
const char *name = item_feat_string(token,"name");
|
|
|
|
return val_int_n(cst_strlen(name));
|
|
}
|
|
|
|
static const cst_val *month_range(const cst_item *token)
|
|
{
|
|
int v = item_feat_int(token,"name");
|
|
|
|
if ((v > 0) && ( v < 32))
|
|
return VAL_STRING_1;
|
|
else
|
|
return VAL_STRING_0;
|
|
}
|
|
|
|
static const cst_val *token_pos_guess(const cst_item *token)
|
|
{
|
|
const char *name = item_feat_string(token,"name");
|
|
char *dc = cst_downcase(name);
|
|
const cst_val *r;
|
|
|
|
if (cst_regex_match(cst_rx_digits,dc))
|
|
r = &val_string_numeric;
|
|
else if ((cst_regex_match(cst_rx_double,dc)) ||
|
|
(cst_regex_match(cst_rx_double,dc)))
|
|
r = &val_string_number;
|
|
else if (cst_streq(dc,"jan") ||
|
|
cst_streq(dc,"january") ||
|
|
cst_streq(dc,"feb") ||
|
|
cst_streq(dc,"february") ||
|
|
cst_streq(dc,"mar") ||
|
|
cst_streq(dc,"march") ||
|
|
cst_streq(dc,"apr") ||
|
|
cst_streq(dc,"april") ||
|
|
cst_streq(dc,"may") ||
|
|
cst_streq(dc,"jun") ||
|
|
cst_streq(dc,"june") ||
|
|
cst_streq(dc,"jul") ||
|
|
cst_streq(dc,"july") ||
|
|
cst_streq(dc,"aug") ||
|
|
cst_streq(dc,"august") ||
|
|
cst_streq(dc,"sep") ||
|
|
cst_streq(dc,"sept") ||
|
|
cst_streq(dc,"september") ||
|
|
cst_streq(dc,"oct") ||
|
|
cst_streq(dc,"october") ||
|
|
cst_streq(dc,"nov") ||
|
|
cst_streq(dc,"november") ||
|
|
cst_streq(dc,"dec") ||
|
|
cst_streq(dc,"december"))
|
|
r = &val_string_month;
|
|
else if (cst_streq(dc,"sun") ||
|
|
cst_streq(dc,"sunday") ||
|
|
cst_streq(dc,"mon") ||
|
|
cst_streq(dc,"monday") ||
|
|
cst_streq(dc,"tue") ||
|
|
cst_streq(dc,"tues") ||
|
|
cst_streq(dc,"tuesday") ||
|
|
cst_streq(dc,"wed") ||
|
|
cst_streq(dc,"wednesday") ||
|
|
cst_streq(dc,"thu") ||
|
|
cst_streq(dc,"thurs") ||
|
|
cst_streq(dc,"thursday") ||
|
|
cst_streq(dc,"fri") ||
|
|
cst_streq(dc,"friday") ||
|
|
cst_streq(dc,"sat") ||
|
|
cst_streq(dc,"saturday"))
|
|
r = &val_string_day;
|
|
/* ignoring the "token_most_common" condition, does get used */
|
|
else if (cst_streq(dc,"a"))
|
|
r = &val_string_a;
|
|
else if (cst_streq(dc,"flight"))
|
|
r = &val_string_flight;
|
|
else if (cst_streq(dc,"to"))
|
|
r = &val_string_to;
|
|
else
|
|
r = &val_string_other;
|
|
cst_free(dc);
|
|
return r;
|
|
}
|
|
|
|
void us_ff_register(cst_features *ffunctions)
|
|
{
|
|
|
|
/* The language independent ones */
|
|
basic_ff_register(ffunctions);
|
|
|
|
ff_register(ffunctions, "gpos",gpos);
|
|
ff_register(ffunctions, "num_digits",num_digits);
|
|
ff_register(ffunctions, "month_range",month_range);
|
|
ff_register(ffunctions, "token_pos_guess",token_pos_guess);
|
|
|
|
}
|