116 lines
3.2 KiB
Ruby
116 lines
3.2 KiB
Ruby
|
# This script reads transformation rules for English grapheme-to-phoneme (G2P)
# conversion from a text file and generates C++ code.
# The rule format is described here: http://www.zompist.com/sounds.htm
|
||
|
|
||
|
# Maps each one-character class name to the string of characters that class
# stands for. Filled while parsing the class definitions at the top of the
# rule file and consulted by formatRule.
$characterClasses = {}
|
||
|
|
||
|
# Converts one sound-change rule into a C++ initializer line of the form
#   { wregex(L"<regex>"), L"<replacement>" },
#
# searchValue  - text to find, or the one-character name of a character class
# replaceValue - replacement text, or the one-character name of a character class
# contextBegin - text that must precede the match; a leading '#' becomes '^'
# contextEnd   - text that must follow the match; a trailing '#' becomes '$'
#
# Returns nil when search and replace values are identical. When both values
# name character classes, returns the space-separated sub-rules for the
# pairwise class members (pairs that map a character onto itself are skipped).
# Otherwise returns a single initializer line. The arguments are not modified.
def formatRule(searchValue, replaceValue, contextBegin, contextEnd)
  return nil if replaceValue == searchValue

  # Special case: search and replace values are character classes
  if $characterClasses.key?(searchValue) && $characterClasses.key?(replaceValue)
    searchCharacters = $characterClasses[searchValue]
    replaceCharacters = $characterClasses[replaceValue]
    count = [searchCharacters.length, replaceCharacters.length].min
    result = +''
    count.times do |i|
      subrule = formatRule(searchCharacters[i], replaceCharacters[i], contextBegin, contextEnd)
      next unless subrule

      result << ' ' unless result.empty?
      result << subrule
    end
    return result
  end

  # Special characters: escape regex metacharacters. Regexp.escape also escapes
  # '#', so the anchor marker shows up as "\#" and is turned into '^' / '$'.
  searchPattern = Regexp.escape(searchValue)
  beginPattern = Regexp.escape(contextBegin).sub(/^\\\#/, '^')
  endPattern = Regexp.escape(contextEnd).sub(/\\\#$/, '$')

  # Contexts become capture groups that the replacement copies back verbatim
  regexString = searchPattern
  replacement = replaceValue
  unless beginPattern.empty?
    regexString = "(#{beginPattern})#{regexString}"
    replacement = "$1#{replacement}"
  end
  unless endPattern.empty?
    regexString = "#{regexString}(#{endPattern})"
    replacement = "#{replacement}#{beginPattern.empty? ? '$1' : '$2'}"
  end

  # Optional parts: a parenthesized part of the rule (escaped to "\(...\)" above)
  # becomes an optional non-capturing group
  regexString = regexString.gsub(/\\\((.*?)\\\)/, '(?:\\1)?')

  # Fold repeated characters/classes, longest runs first:
  # "aaaaa" -> "a{5}", "aaaa" -> "a{4}", "aaa" -> "a{3}", "aa" -> "a{2}"
  5.downto(2) do |runLength|
    regexString = regexString.gsub(/(\w)\1{#{runLength - 1}}/, "\\1{#{runLength}}")
  end

  # Character classes: replace each class name with a bracket expression
  regexString = regexString.gsub(/./) do |ch|
    $characterClasses.key?(ch) ? "[#{$characterClasses[ch]}]" : ch
  end

  # C++ string escaping: put a backslash in front of every backslash and
  # double quote. The block form keeps the matched character in the output.
  regexString = regexString.gsub(/[\\"]/) { |ch| "\\#{ch}" }
  replacement = replacement.gsub(/[\\"]/) { |ch| "\\#{ch}" }

  "{ wregex(L\"#{regexString}\"), L\"#{replacement}\" },"
end
|
||
|
|
||
|
# Load the rule file and split it into lines (LF or CRLF line endings)
lines = File.read('../lib/soundchange/english.sc', encoding: 'iso-8859-1').split(/\r?\n/)

# Collect the leading character class definitions ("X=abc"). Comment lines
# (starting with '*') are skipped; the first other line that is not a
# definition ends this section.
characterClassLineCount = 0
lines.each.with_index(1) do |line, lineNumber|
  next if line.start_with?('*')

  definition = /^(.)=(.+)$/.match(line)
  break unless definition

  # Remember how many leading lines belong to the definition section
  characterClassLineCount = lineNumber
  $characterClasses[definition[1]] = definition[2]
end
|
||
|
|
||
|
# Translate every line after the character class section and write the
# result to the generated C++ file
File.open('g2pRules.cpp', 'w:UTF-8') do |file|
  file.print "// Generated by #{__FILE__}; don't modify by hand!\n\n"

  lines.drop(characterClassLineCount).each do |line|
    if (comment = /^\*(.*)$/.match(line))
      # Comment lines ('*' prefix) are carried over as C++ comments
      file.puts "//#{comment[1]}"
    elsif (rule = /^(.+)\/(.*)\/(.*)_(.*)$/.match(line))
      # Rule lines: search/replace/contextBegin_contextEnd
      file.puts formatRule(*rule.captures)
    else
      raise "Invalid rule: #{line}"
    end
  end
end
|
||
|
|