Added sound change code and data

This commit is contained in:
Daniel Wolf 2016-06-03 10:37:47 +02:00
parent 8be6485685
commit bf19d267ee
3 changed files with 768 additions and 0 deletions

View File

@ -0,0 +1,9 @@
**The MIT License (MIT)**
Copyright (c) 2000 Mark Rosenfelder
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

235
lib/soundchange/english.sc Normal file
View File

@ -0,0 +1,235 @@
*
* Variables
*
* vowels, long, short
U=aeiou
V=aeiouäëïöüâêîôûùò@
L=äëïöüäëïöüäëïöüùò@
S=âêîôûâêîôûâêîôûùò@
A=aâä
E=eêë
I=iîï
O=oôö
&=eiou
* front
F=eiêîy
* any letter
X=bcdfghjklmnpqrstvwxyzç+$ñaeiouäëïöüâêîôûùò@
* consonants
C=bcdfghjklmnpqrstvwxyzç+$ñ
* dentals, liquids, nasals
D=td+
R=rl
M=mnñ
T=tdns+
* stops, fricatives (voiced and voiceless)
P=ptk
B=bdg
ß=fs$+
Z=vz#+
*
* Rules
*
* get rid of some digraphs
ch/ç/_
sh/$/_
ph/f/_
th/+/_
qu/kw/_
* and other spelling-level changes
w//_r
w//_ho
h//w_
h//#r_
h//x_
h//V_#
x/gz/#e_V
x/ks/_
'//_
* gh is particularly variable
gh/g/_V
V/L/C_gh
ough/ò/_t
augh/ò/_t
ough/ö/_
gh//_
* unpronounceable combinations
g//#_n
k//#_n
m//#_n
p//#_t
p//#_s
t//#_m
* medial y = i
y/ï/#C_#
y/ï/#CC_#
y/ï/#CCC_#
ey/ë/_
ay/ä/_
oy/öy/_
y/i/C_C
y/i/C_#
y/i/C_e#
ie/ï/CC_#
ie/ï/#C_#
* stl can simplify
t//s_lV#
* affrication of t + front vowel
ci/$/X_V
ti/$/X_V
tu/çu/X_V
tu/çu/X_RV
si/$/C_o
si/j/V_o
s/$/C_ur
s/j/V_ur
s/$/k_uV
s/$/k_uR
* intervocalic s
s/z/&_V
* al to ol (do this before respelling)
a/ò/_ls
a/ò/_lr
a/ò/_ll#
a/ò/_lm(V)#
a/ò/C_lD
a/ò/#_lD
al/ò/X_k
* soft c and g
c/s/_F
c/k/_
ge/j/X_a
ge/j/X_o
g/j/_F
* init/final guF was there just to harden the g
gu/g/#_F
gu/g/_e#
* untangle reverse-written final liquids
re/@r/C_#
le/@l/C_#
* vowels are long medially
U/L/C_CV
U/L/#_CV
* and short before 2 consonants or a final one
U/S/C_CC
U/S/#_CC
U/S/C_C#
U/S/#_C#
* special but general rules
î/ï/_nd#
ô/ò/_ss#
ô/ò/_g#
ô/ò/_fC
ô/ö/_lD
â/ò/w_$
â/ò/w_(t)ç
â/ô/w_T
* soft gn
îg/ï/_M#
îg/ï/_MC
g//ei_n
* handle ous before removing -e
ou/@/_s#
ou/@/_sC
* remove silent -e
e//VC(C)(C)_#
* common suffixes that hide a silent e
ë//XXX_mênt#
ë//XXX_nêss#
ë//XXX_li#
ë//XXX_fûl#
* another common suffix
ï/ë/XXX_nêss#
* shorten (1-char) weak penults after a long
* note: this rule breaks almost as many words as it fixes...
L/S/LC(C)(C)_CV#
* double vowels
eau/ö/_
ai/ä/_
au/ò/_
âw/ò/_
ee/ë/_
ea/ë/_
ei/ë/s_
ei/ä/_
eo/ë@/_
êw/ü/_
eu/ü/_
ie/ë/_
V/@/i_
i/ï/#C(C)_
i/ë/_@
oa/ö/_
oe/ö/_#
oo/ù/_k
oo/u/_
oul/ù/_d#
ou/ôw/_
oi/öy/_
ua/ü@/_
ue/u/_
ui/u/_
ôw/ö/_#
* those pesky final syllabics
V/@/VC(V)_l#
ê/@/VC(C)_n#
î/@/VC(C)_n#
â/@/VC(C)_n#
ô/@/VC(C)_n#
* suffix simplifications
A/@/XXX_b@l#
ë/y/Xl_@n#
ë/y/Xn_@n#
* unpronounceable finals
b//m_#
n//m_#
* color the final vowels
a/@/_#
e/ë/_#
i/ë/_#
o/ö/_#
* vowels before r V=aeiouäëïöüâêîôûùò@
ôw/ö/_rX
ô/ö/_r
ò/ö/_r
â/ö/w_rC
â/ö/w_r#
ê/ä/_rr
ë/ä/_rIC
â/ä/_rr
â/ô/_rC
â/ô/_r#
â/ä/_r
ê/@/_r
î/@/_r
û/@/_r
ù/@/_r
* handle ng
ng/ñ/
ng/ñ/_B
ng/ñ/_P
ng/ñ/_#
n/ñ/_g
n/ñ/_k
ô/ò/
â/ä/
* really a morphophonological rule, but it's cute
s/z/B_#
s/z/_m#
* double consonants
s//_s
s//_$
t//_t
t//
p//_p
k//_k
b//_b
d//_d
d//_j
g//_g
n//_n
m//_m
r//_r
l//_l
f//_f
z//_z

524
lib/soundchange/sounds.c Normal file
View File

@ -0,0 +1,524 @@
/*
** SOUNDS.C
**
** Sound Change Applier
**
** Copyright (C) 2000 by Mark Rosenfelder.
** This program may be freely used and modified for non-commercial purposes.
** See http://www.zompist.com/sounds.htm for documentation.
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
/* Boolean values used as return codes throughout this file. */
#define TRUE 1
#define FALSE 0
/*
** Run-time options; main() changes them from the command line.
**
** printRules   set by -p: TryRule() prints every rule that applies.
** bracketOut   set by -b: DoWords() writes "result \t[source]"
**              (only consulted while printSourc is non-zero).
** printSourc   cleared by -l: DoWords() writes the result only.
** toScreen     cleared by -f: DoWords() stops echoing output to stdout.
*/
static int printRules = 0;
static int bracketOut = 0;
static int printSourc = 1;
static int toScreen = 1;
/* Number of slots in the Rule[] and Cat[] tables below. */
#define MAXRULE 200
#define MAXCAT 50
/* Sound changes loaded by ReadRules(), in file order; nRule is the count. */
static int nRule = 0;
static char *Rule[MAXRULE];
/* Category definitions loaded by ReadRules(), in file order; nCat is the count. */
static int nCat = 0;
static char *Cat[MAXCAT];
/*
** ReadRules
**
** Read in the rules file *.sc for a given project.
**
** There are two types of rules: sound changes and category definitions.
** The former are stored in Rule[], the latter in Cat[].
**
** The format of these rules is given under Transform().
*/
int ReadRules( char *filestart )
{
char filename[84];
char buffer[129];
char *s;
int n;
FILE *f;
nRule = 0;
nCat = 0;
/* Open the file */
sprintf( filename, "%s.sc", filestart );
f = fopen( filename, "r" );
if (!f)
{
printf( "File %s could not be read in.\n\n", filename );
return(FALSE);
}
while (fgets( buffer, 129, f))
{
if (strlen(buffer))
buffer[strlen(buffer)-1] = '\0';
s = malloc( strlen(buffer) + 1);
if (s)
strcpy( s, buffer );
if (buffer[0] != '*')
{
if (strchr( buffer, '/' ))
Rule[nRule++] = s;
else if (strchr( buffer, '='))
Cat[ nCat++] = s;
}
}
fclose(f);
if (nCat)
{
printf( "%i categories found\n", nCat );
#ifdef PRINT_RULES
for (n = 0; n < nCat; n++)
printf( "%s\n", Cat[n] );
printf( "\n" );
#endif
}
else
printf( "No rules were found.\n\n" );
if (nRule)
{
printf( "%i rules found\n", nRule );
#ifdef PRINT_RULES
for (n = 0; n < nRule; n++)
printf( "%s\n", Rule[n] );
printf( "\n" );
#endif
}
else
printf( "No rules were found.\n\n" );
return( nRule );
} /*ReadRules*/
/*
** Divide
**
** Divide a rule into source and target phoneme(s) and environment.
** That is, for a rule s1/s2/env
** create the three null-terminated strings s1, s2, and env.
**
** s1 must be 1..19 characters, s2 0..19 characters and env 0..49
** characters long, and both '/' separators must be present.
**
** If this cannot be done, return FALSE; the output pointers are then
** left untouched.
**
** The strings handed back live in static buffers that are overwritten
** by the next call.
*/
int Divide( char *Rule, char **s1, char **s2, char **env )
{
    static char s1_str[20];
    static char s2_str[20];
    static char ev_str[50];
    size_t len;

    /* Source: non-empty, fits the buffer, and is followed by a '/' */
    len = strcspn( Rule, "/" );
    if (len == 0 || len >= sizeof s1_str || Rule[len] != '/')
        return(FALSE);
    memcpy( s1_str, Rule, len );
    s1_str[len] = '\0';
    Rule += len + 1;

    /* Target: may be empty (deletion). The second '/' is mandatory;
       without this check we would step past the string terminator. */
    len = strcspn( Rule, "/" );
    if (len >= sizeof s2_str || Rule[len] != '/')
        return(FALSE);
    memcpy( s2_str, Rule, len );
    s2_str[len] = '\0';
    Rule += len + 1;

    /* Environment: the rest of the line, which must fit as well */
    len = strlen( Rule );
    if (len >= sizeof ev_str)
        return(FALSE);
    memcpy( ev_str, Rule, len + 1 );

    *s1 = s1_str;
    *s2 = s2_str;
    *env = ev_str;
    return(TRUE);
} /*Divide*/
/*
** TryCat
**
** See if a particular phoneme sequence is part of any category.
** (We try all the categories.)
**
** The category is identified by the first character of env; only the
** first character of word is examined.
**
** For instance, if we have 'a' in the source word and 'V' in the
** structural description, and a category V=aeiou, TryCat returns TRUE,
** sets *n to the number of characters to skip (always 1), and sets
** *catLoc to the offset of the matched character from the start of the
** category line (so 'a' in "V=aeiou" gives 2).
**
** If we had 'b' instead, TryCat would return FALSE instead.
** FALSE is also returned when word is empty.
**
** If no category with the given identification (env) can be found,
** we return TRUE (continue looking), but set *n to 0; *catLoc is not
** set in that case.
**
** Warning: For now, we don't have a way to handle digraphs.
*/
int TryCat( char *env, char *word, int *n, int *catLoc )
{
    int c;

    if (*word == '\0')
        return(FALSE);

    for (c = 0; c < nCat; c++)
    {
        char *catdef;
        char *hit;

        if (!Cat[c] || *env != *Cat[c])
            continue;

        catdef = strchr( Cat[c], '=' );
        if (!catdef)
            continue;

        /* Look only at the members after '=', and derive the offset from
           that same hit, so the identifier can never be mistaken for a
           member of the category. */
        hit = strchr( catdef + 1, word[0] );
        if (!hit)
            return(FALSE);

        *n = 1;
        *catLoc = (int) (hit - Cat[c]);
        return(TRUE);
    }

    *n = 0;
    return(TRUE);
} /*TryCat*/
/*
** TryRule
**
** See if a rule s1->s2/env applies at position i in the given word.
** Position i is where the first element of the environment is matched;
** the place where s1 itself sits (the '_' of the environment) may
** therefore lie to the right of i.
**
** If it does, we pass back the index where s1 was found in the
** word (*n), as well as s1 and s2, and return TRUE.
**
** *varRep is reset to '\0' on entry. When s1 did not match literally but
** matched as a category, and the first character of s2 names a category
** too, *varRep receives the character found in that category at the same
** offset (catLoc) at which the word's character was found in s1's
** category.
**
** Otherwise, we return FALSE, and pass garbage in the output variables.
** FALSE is also returned when the rule cannot be split by Divide() or
** when its environment contains no '_'.
**
** s1, s2 point into the static buffers filled by Divide().
*/
int TryRule( char *word, int i, char *Rule, int *n, char **s1, char **s2, char *varRep )
{
int j, m, cont = 0;
int catLoc;
char *env;
int optional = FALSE;
*varRep = '\0';
if (!Divide( Rule, s1, s2, &env ) || !strchr( env, '_' ))
return(FALSE);
/* Walk the environment one character at a time; i tracks the position
   in the word, and cont turns FALSE as soon as an element fails. */
for (j = 0, cont = TRUE; cont && j < strlen(env); j++)
{
switch( env[j] )
{
/* Parentheses only switch the "optional" flag on and off */
case '(':
optional = TRUE;
break;
case ')':
optional = FALSE;
break;
/* Word boundary: as the first environment character it demands the
   start of the word, anywhere else the end of the word */
case '#':
cont = j ? (i == strlen(word)) : (i == 0);
break;
/* The slot for s1: try a literal match first */
case '_':
cont = !strncmp( &word[i], *s1, strlen(*s1) );
if (cont)
{
*n = i;
i += strlen(*s1);
}
else
{
/* No literal match: see whether s1 names a category that word[i] belongs to */
cont = TryCat( *s1, &word[i], &m, &catLoc );
if (cont && m)
{
int c;
*n = i;
i += m;
/* Every category whose identifier equals the first character of s2
   and whose line is longer than catLoc supplies a replacement
   character; the loop does not stop, so the last such category wins */
for (c = 0; c < nCat; c++)
if ((*s2)[0] == Cat[c][0] && catLoc < strlen(Cat[c]))
*varRep = Cat[c][catLoc];
}
/* TryCat() found no category for s1 (m == 0): s1 is simply absent here */
else if (cont)
cont = FALSE;
}
break;
/* Any other character: a category identifier, or else a literal */
default:
cont = TryCat( &env[j], &word[i], &m, &catLoc );
if (cont && !m)
{
/* no category applied */
cont = i < strlen(word) && word[i] == env[j];
m = 1;
}
if (cont)
i += m;
/* A failed element inside parentheses is forgiven; i is not advanced */
if (!cont && optional)
cont = TRUE;
}
}
/* Trace output, enabled by the printRules flag */
if (cont && printRules)
printf( " %s->%s /%s applies to %s at %i\n",
*s1, *s2, env, word, *n );
return(cont);
} /*TryRule*/
/*
** Transform
**
** Apply the rules to a single word and return the result.
**
** The rules are stated in the form string1/string2/environment, e.g.
** f/h/#_V
** which states that f changes to h at the beginning of a word before a
** vowel.
**
** The rules in Rule[] are applied in order; the output of one rule is
** the input to the next one. Within one rule the word is scanned from
** left to right and every position is tried with TryRule().
**
** Words are limited to 79 characters: longer input, or a result that
** grows beyond that, is truncated instead of overrunning the buffers.
** If no rules are loaded the (possibly truncated) input is returned.
**
** The result is held in a static buffer that is overwritten by the
** next call.
*/
char *Transform( char *input )
{
    enum { WORD_MAX = 80 };
    char inword[WORD_MAX];
    static char outword[WORD_MAX];
    char *s1, *s2;
    int r;

    /* Start from the input so that "no rules" yields the word unchanged */
    snprintf( outword, sizeof outword, "%s", input );

    /* Try to apply each rule in turn */
    for (r = 0; r < nRule; r++)
    {
        size_t inlen;
        size_t outlen = 0;
        int i = 0;

        if (!Rule[r])
            continue;

        /* Output of the previous rule is input to this one;
           both buffers have the same size, so this always fits */
        strcpy( inword, outword );
        inlen = strlen( inword );

        /* Check each position of the input word in turn */
        while ((size_t) i < inlen)
        {
            char varRep = 0;
            int n = 0;

            if (TryRule( inword, i, Rule[r], &n, &s1, &s2, &varRep ))
            {
                const char *p;

                /* Rule applies at inword[n]; the characters between i and
                   n are the left-hand environment and are kept as they are */
                for (; i < n; i++)
                    if (outlen < WORD_MAX - 1)
                        outword[outlen++] = inword[i];

                if (varRep)
                {
                    /* Category-to-category change: single replacement char */
                    if (outlen < WORD_MAX - 1)
                        outword[outlen++] = varRep;
                }
                else
                {
                    /* Literal replacement (empty s2 means deletion) */
                    for (p = s2; *p; p++)
                        if (outlen < WORD_MAX - 1)
                            outword[outlen++] = *p;
                }

                /* Continue after the replaced source string */
                i = n + (int) strlen( s1 );
            }
            else
            {
                /* Rule doesn't apply at this location */
                if (outlen < WORD_MAX - 1)
                    outword[outlen++] = inword[i];
                i++;
            }
        }
        outword[outlen] = '\0';
    }

    /* Return the output of the last rule */
    return(outword);
} /*Transform*/
/*
** DoWords
**
** Read in each word in turn from the input file <lexname>.lex,
** transform it according to the rules,
** and output it to the output file <outname>.out
** (and to the screen unless toScreen is off).
**
** Output format, depending on the option flags:
**   printSourc off:  result
**   bracketOut on:   result \t[source]
**   otherwise:       source --> result
**
** This algorithm ensures that word files of any size can be processed.
** Each line is one word; a line longer than 79 characters is cut off
** at that length, matching the 80-byte word buffers used by Transform().
*/
void DoWords( char *lexname, char *outname )
{
    char filename[84];
    char inword[80];
    int n = 0;
    int len;
    FILE *f, *g;
    char *outword;

    len = snprintf( filename, sizeof filename, "%s.lex", lexname );
    if (len < 0 || (size_t) len >= sizeof filename)
    {
        printf( "File %s.lex could not be read in.\n\n", lexname );
        return;
    }
    f = fopen( filename, "r" );
    if (!f)
    {
        printf( "File %s could not be read in.\n\n", filename );
        return;
    }

    len = snprintf( filename, sizeof filename, "%s.out", outname );
    if (len < 0 || (size_t) len >= sizeof filename)
    {
        printf( "File %s.out could not be created.\n\n", outname );
        fclose(f);
        return;
    }
    g = fopen( filename, "w" );
    if (!g)
    {
        printf( "File %s could not be created.\n\n", filename );
        fclose(f);
        return;
    }

    while (fgets( inword, sizeof inword, f ))
    {
        size_t end = strcspn( inword, "\r\n" );

        /* No terminator and a full buffer: the line was longer than the
           buffer, so skip the rest of it rather than treat the tail as
           another word */
        if (inword[end] == '\0' && end == sizeof inword - 1)
        {
            int ch;
            while ((ch = fgetc( f )) != EOF && ch != '\n')
                ;
        }

        /* Remove the line terminator only when one is present, so the
           last word of a file without trailing newline stays intact */
        inword[end] = '\0';

        n++;
        outword = Transform(inword);

        if (!printSourc)
        {
            if (toScreen)
                printf( "%s\n", outword );
            fprintf( g, "%s\n", outword );
        }
        else if (bracketOut)
        {
            if (toScreen)
                printf( "%s \t[%s]\n", outword, inword );
            fprintf( g, "%s \t[%s]\n", outword, inword );
        }
        else
        {
            if (toScreen)
                printf( "%s --> %s\n", inword, outword );
            fprintf( g, "%s --> %s\n", inword, outword );
        }
    }

    fclose(f);
    /* fclose flushes the output; a failure here means lost data */
    if (fclose(g) != 0)
        printf( "File %s could not be written completely.\n\n", filename );

    printf( "%i word%s processed.\n", n, n == 1 ? "" : "s" );
} /*DoWords*/
/*
** PromptLine
**
** Read one line from stdin into dest (size bytes) and strip the line
** terminator. Over-long input is cut off and the rest of the line is
** discarded. Returns FALSE at end of input or on a read error.
*/
static int PromptLine( char *dest, int size )
{
    size_t end;

    if (!fgets( dest, size, stdin ))
    {
        dest[0] = '\0';
        return(FALSE);
    }

    end = strcspn( dest, "\r\n" );
    if (dest[end] == '\0' && end == (size_t) size - 1)
    {
        /* Discard the remainder so it is not taken as the next answer */
        int ch;
        while ((ch = getchar()) != EOF && ch != '\n')
            ;
    }
    dest[end] = '\0';
    return(TRUE);
} /*PromptLine*/
/*
** MAIN ROUTINE
**
** Ask for name of project
** Read in rules and input words
** Apply transformations
** Output words
**
** Command line:  [options] lexicon rules
**   -p  print each rule as it applies
**   -b  write the source word in brackets after the result
**   -l  write the result only
**   -f  do not echo the output to the screen
**
** If both a lexicon and a rules file are named on the command line they
** are processed once; otherwise the names are asked for interactively
** until "q" is entered or the input ends.
*/
int main( int argc, char **argv )
{
    int once = FALSE;
    char lexicon[65] = "";
    char rules[65] = "";
    int i;

    /* Read command line arguments */
    for (i = 1; i < argc; i++)
    {
        if (argv[i][0] == '-' && strlen(argv[i]) > 1)
        {
            switch (argv[i][1])
            {
                case 'p': case 'P': printRules = 1; break;
                case 'b': case 'B': bracketOut = 1; break;
                case 'l': case 'L': printSourc = 0; break;
                case 'f': case 'F': toScreen = 0; break;
                default: break;
            }
        }
        /* Names are bounded to the buffer size instead of overflowing it */
        else if (!lexicon[0])
            snprintf( lexicon, sizeof lexicon, "%s", argv[i] );
        else
            snprintf( rules, sizeof rules, "%s", argv[i] );
    }
    once = lexicon[0] && rules[0];

    printf( "\nSOUND CHANGE APPLIER\n(C) 1992,2000 by Mark Rosenfelder\nFor more information see www.zompist.com\n\n" );

    if (once)
    {
        /* The rules file is <rules>.sc, the word list <lexicon>.lex */
        printf( "Applying %s.sc to %s.lex\n\n", rules, lexicon );
        if (ReadRules( rules ))
            DoWords( lexicon, rules );
    }
    else
    {
        for (;;)
        {
            printf( "\nEnter the name of a LEXICON.\n\n" );
            printf( "For example, enter latin to specify latin.lex.\nEnter q to quit the program.\n-->" );
            /* Stop at end of input as well, otherwise we would prompt forever */
            if (!PromptLine( lexicon, sizeof lexicon ) || !strcmp( lexicon, "q" ))
                break;

            printf( "Enter the name of a RULES FILE.\n\n" );
            printf( "For example, enter french to specify french.sc.\n" );
            printf( "The output words would be stored in french.out.\n-->" );
            if (!PromptLine( rules, sizeof rules ))
                break;

            if (ReadRules( rules ))
                DoWords( lexicon, rules );
        }
    }

    printf( "\nThank you for using the SOUND CHANGE APPLIER!\n" );
    return 0;
} /*main*/