161 lines
7.6 KiB
C
161 lines
7.6 KiB
C
/*************************************************************************/
|
|
/* */
|
|
/* Language Technologies Institute */
|
|
/* Carnegie Mellon University */
|
|
/* Copyright (c) 1999 */
|
|
/* All Rights Reserved. */
|
|
/* */
|
|
/* Permission is hereby granted, free of charge, to use and distribute */
|
|
/* this software and its documentation without restriction, including */
|
|
/* without limitation the rights to use, copy, modify, merge, publish, */
|
|
/* distribute, sublicense, and/or sell copies of this work, and to */
|
|
/* permit persons to whom this work is furnished to do so, subject to */
|
|
/* the following conditions: */
|
|
/* 1. The code must retain the above copyright notice, this list of */
|
|
/* conditions and the following disclaimer. */
|
|
/* 2. Any modifications must be clearly marked as such. */
|
|
/* 3. Original authors' names are not deleted. */
|
|
/* 4. The authors' names are not used to endorse or promote products */
|
|
/* derived from this software without specific prior written */
|
|
/* permission. */
|
|
/* */
|
|
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
|
|
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
|
|
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
|
|
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
|
|
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
|
|
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
|
|
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
|
|
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
|
|
/* THIS SOFTWARE. */
|
|
/* */
|
|
/*************************************************************************/
|
|
/* Author: Alan W Black (awb@cs.cmu.edu) */
|
|
/* Date: January 2000 */
|
|
/*************************************************************************/
|
|
/* */
|
|
/* cst front-end to Henry Spencer's regex code */
|
|
/* */
|
|
/*************************************************************************/
|
|
|
|
/* Includes portions of regexp.h, copyright follows: */
|
|
/*
|
|
* Copyright (c) 1986 by University of Toronto.
|
|
* Copyright (c) 1989, 1993
|
|
* The Regents of the University of California. All rights reserved.
|
|
*
|
|
* This code is derived from software contributed to Berkeley
|
|
* by Henry Spencer.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
* must display the following acknowledgement:
|
|
* This product includes software developed by the University of
|
|
* California, Berkeley and its contributors.
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* @(#)regexp.h 8.1 (Berkeley) 6/2/93
|
|
*/
|
|
|
|
#ifndef _CST_REGEX_H__
|
|
#define _CST_REGEX_H__
|
|
|
|
#include "cst_file.h"
|
|
#include "cst_string.h"
|
|
|
|
/*
 * The first byte of the regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.
 *
 * The value is written in octal (0234 == 156 decimal).
 */
#define CST_REGMAGIC 0234
|
|
|
|
/*
 * A compiled regular expression.
 *
 * Objects of this type are handed out by new_cst_regex()/hs_regcomp() and
 * consumed by the matching functions declared in this header.  The first
 * four members are private to the regex engine.
 */
typedef struct cst_regex_struct {
    char regstart;      /* Internal use only. */
    char reganch;       /* Internal use only. */
    char *regmust;      /* Internal use only. */
    int regmlen;        /* Internal use only. */
    int regsize;        /* NOTE(review): presumably the size of `program`
                           in bytes -- confirm against the regex source. */
    char *program;      /* The regexp internal "program"; its first byte
                           is the magic number CST_REGMAGIC. */
} cst_regex;
|
|
|
|
/* Number of slots in each of the startp[] / endp[] arrays below. */
#define CST_NSUBEXP 10

/*
 * State produced by a regex match (see cst_regex_match_return() and
 * hs_regexec()) and read by cst_regsub().
 *
 * NOTE(review): in Henry Spencer's regexp design startp[i]/endp[i] delimit
 * the text matched by the i-th parenthesised subexpression, with slot 0
 * covering the whole match -- confirm against the implementation.
 */
typedef struct cst_regstate_struct {
    const char *startp[CST_NSUBEXP];
    const char *endp[CST_NSUBEXP];
    const char *input;  /* NOTE(review): presumably the current scan
                           position in the subject string -- confirm. */
    const char *bol;    /* NOTE(review): presumably the beginning of the
                           subject string, used for anchors -- confirm. */
} cst_regstate;
|
|
|
|
/*
 * Build a cst_regex from the pattern string str, e.g.
 * new_cst_regex("\\(.*\\)_\\(.*\\)").
 * NOTE(review): the result is presumably owned by the caller and released
 * with delete_cst_regex() -- confirm against the implementation.
 */
cst_regex *new_cst_regex(const char *str);

/* Dispose of a regex r. */
void delete_cst_regex(cst_regex *r);
|
|
|
|
int cst_regex_match(const cst_regex *r, const char *str);
|
|
cst_regstate *cst_regex_match_return(const cst_regex *r, const char *str);
|
|
|
|
/* Internal functions from original HS code */
|
|
cst_regex *hs_regcomp(const char *);
|
|
cst_regstate *hs_regexec(const cst_regex *, const char *);
|
|
void hs_regdelete(cst_regex *);
|
|
|
|
/* Works similarly to snprintf(3), in that at most max characters are
|
|
written to out, including the trailing NUL, and the return value is
|
|
the number of characters written, *excluding* the trailing NUL.
|
|
Also works similarly to wcstombs(3) in that passing NULL as out
|
|
will count the number of characters that would be written without
|
|
doing any actual conversion, and ignoring max. So, you could use
|
|
it like this:
|
|
|
|
rx = new_cst_regex("\\(.*\\)_\\(.*\\)");
|
|
if ((rs = cst_regex_match_return(rx, "foo_bar")) != NULL) {
|
|
size_t n;
|
|
|
|
n = cst_regsub(rs, "\\1_\\2_quux", NULL, 0) + 1;
|
|
out = cst_alloc(char, n);
|
|
cst_regsub(rs, "\\1_\\2_quux", out, n);
|
|
} */
|
|
size_t cst_regsub(const cst_regstate *r, const char *in, char *out, size_t max);
|
|
|
|
/* Initialize the regex engine and global regex constants.
   Declared with (void) so that this is a real prototype: an empty
   parameter list in C leaves the arguments unchecked. */
void cst_regex_init(void);
|
|
|
|
/* Regexps used in text processing (these are latin-alphabet specific
|
|
and to some extent US English-specific) */
|
|
extern const cst_regex * const cst_rx_white;
|
|
extern const cst_regex * const cst_rx_alpha;
|
|
extern const cst_regex * const cst_rx_uppercase;
|
|
extern const cst_regex * const cst_rx_lowercase;
|
|
extern const cst_regex * const cst_rx_alphanum;
|
|
extern const cst_regex * const cst_rx_identifier;
|
|
extern const cst_regex * const cst_rx_int;
|
|
extern const cst_regex * const cst_rx_double;
|
|
extern const cst_regex * const cst_rx_commaint;
|
|
extern const cst_regex * const cst_rx_digits;
|
|
extern const cst_regex * const cst_rx_dotted_abbrev;
|
|
|
|
/* Table of regexps used in CART trees (only one so far) */
|
|
extern const cst_regex * const cst_regex_table[];
|
|
#define CST_RX_dotted_abbrev_NUM 0
|
|
|
|
#endif
|