128 lines
3.4 KiB
Ruby
128 lines
3.4 KiB
Ruby
# This script reads transformation rules for English grapheme-to-phoneme (G2P)
# conversion from a text file and generates C++ code.
# The rule format is described here: http://www.zompist.com/sounds.htm
|
|
|
|
# Maps a one-character class name to the string of characters that class
# stands for. Filled from the "X=..." definition lines of the rule file and
# consulted by formatRule when it expands class names.
$characterClasses = {}
|
|
|
|
# Escapes backslashes and double quotes so that +text+ can be placed inside a
# C++ string literal. Returns a new String; +text+ is left untouched.
def escapeCppStringLiteral(text)
  # The block form sidesteps gsub's own backslash handling in replacement
  # strings: every matched character is simply prefixed with one backslash.
  text.gsub(/[\\"]/) { |ch| "\\#{ch}" }
end

# Converts a single sound-change rule into C++ source text.
#
# searchValue  - text to search for; may contain "(...)" optional parts and
#                one-character class names defined in $characterClasses
# replaceValue - text that replaces the match
# contextBegin - text required before the match ('' for none); a leading '#'
#                is turned into the regex anchor '^'
# contextEnd   - text required after the match ('' for none); a trailing '#'
#                is turned into the regex anchor '$'
#
# Returns nil when search and replace values are equal. Otherwise returns a
# String of the form `{ wregex(L"<regex>"), L"<replacement>" },`. When both
# values are names of character classes, returns one such entry per pair of
# corresponding characters, separated by single spaces (possibly empty).
# None of the arguments is modified.
def formatRule(searchValue, replaceValue, contextBegin, contextEnd)
  return nil if replaceValue == searchValue

  # Special case: search and replace values are character classes.
  # Pair the classes up character by character; surplus characters of the
  # longer class are ignored.
  if $characterClasses.key?(searchValue) && $characterClasses.key?(replaceValue)
    searchCharacters = $characterClasses[searchValue]
    replaceCharacters = $characterClasses[replaceValue]
    count = [searchCharacters.length, replaceCharacters.length].min
    result = ''.dup
    count.times do |i|
      subrule = formatRule(searchCharacters[i], replaceCharacters[i], contextBegin, contextEnd)
      next unless subrule

      result << ' ' unless result.empty?
      result << subrule
    end
    return result
  end

  # Special characters. Regexp.escape returns fresh strings, so the in-place
  # edits further down never touch the caller's arguments.
  regexString = Regexp.escape(searchValue)
  contextBegin = Regexp.escape(contextBegin)
  contextEnd = Regexp.escape(contextEnd)
  # '$' introduces group references ($1, $2) in the replacement text, so a
  # literal '$' is doubled. Non-mutating on purpose: replaceValue belongs to
  # the caller.
  replacement = replaceValue.gsub('$', '$$')

  # Anchors ('#' has been escaped to '\#' by Regexp.escape above)
  contextBegin.sub!(/^\\\#/, '^')
  contextEnd.sub!(/\\\#$/, '$')

  hasContextBegin = !contextBegin.empty?
  hasContextEnd = !contextEnd.empty?

  # Contexts become capture groups that are copied back into the replacement,
  # so that only the search value itself is substituted.
  if hasContextBegin
    regexString = "(#{contextBegin})" + regexString
    replacement = '$1' + replacement
  end
  if hasContextEnd
    regexString = regexString + "(#{contextEnd})"
    replacement = replacement + (hasContextBegin ? '$2' : '$1')
  end

  # Optional parts: an escaped "\(...\)" becomes an optional non-capturing group
  regexString.gsub!(/\\\((.*?)\\\)/, '(?:\\1)?')

  # Fold repeated characters/classes (longest runs first)
  regexString.gsub!(/(\w)\1\1\1\1/, '\\1{5}')
  regexString.gsub!(/(\w)\1\1\1/, '\\1{4}')
  regexString.gsub!(/(\w)\1\1/, '\\1{3}')
  regexString.gsub!(/(\w)\1/, '\\1{2}')

  # Character classes: replace each class name with a bracket expression
  regexString.gsub!(/./) do |ch|
    $characterClasses.key?(ch) ? "[#{$characterClasses[ch]}]" : ch
  end

  # C++ string escaping happens last so that it also covers the backslashes
  # introduced by Regexp.escape.
  "{ wregex(L\"#{escapeCppStringLiteral(regexString)}\"), L\"#{escapeCppStringLiteral(replacement)}\" },"
end
|
|
|
|
# Load the rule file as a list of lines (line terminators removed)
lines = File.read('../../lib/soundchange/english.sc', :encoding => 'iso-8859-1').split(/\r?\n/)

# Append fallback rules (preceded by two comment lines) after the rules that
# came from the file
supplementaryLines = [
  '* There are a number of cases not covered by these rules.',
  '* Let\'s add some reasonable fallback rules.',
  'a/â/_',
  'e/@/_',
  'i/ë/_',
  'o/ö/_',
  'q/k/_'
]
lines.concat(supplementaryLines)

# Collect the leading "X=..." character class definitions. Comment lines in
# that section are skipped; the first line that is neither a comment nor a
# definition ends the section.
definitionLineCount = 0
lines.each.with_index(1) do |line, lineNumber|
  next if line.start_with?('*')

  definition = /^(.)=(.+)$/.match(line)
  break unless definition

  # Remember how many lines belong to the definition section so that they can
  # be skipped when the rules are converted
  definitionLineCount = lineNumber
  $characterClasses[definition[1]] = definition[2]
end

# Convert everything after the definition section to C++: comment lines turn
# into "//" comments, rule lines go through formatRule, and any other line
# aborts the script.
File.open('g2pRules.cpp', 'w:UTF-8') do |output|
  output.print "// Generated by #{__FILE__}; don't modify by hand!\n\n"

  lines.drop(definitionLineCount).each do |line|
    if (comment = /^\*(.*)$/.match(line))
      output.puts "//#{comment[1]}"
    elsif (rule = /^(.+)\/(.*)\/(.*)_(.*)$/.match(line))
      # Captures, in order: search value, replace value, context before,
      # context after
      output.puts formatRule(*rule.captures)
    else
      raise "Invalid rule: #{line}"
    end
  end
end
|
|
|