# Source: rhubarb-lip-sync/rhubarb/src/recognition/g2pRules.rb (Ruby; listed as 128 lines, 3.4 KiB)

# This script reads transformation rules for English G2P from a text file
# and generates C++ code.
# The rule format is described here: http://www.zompist.com/sounds.htm
# Character classes keyed by their single-character name; each value is the
# string of characters belonging to that class.
$characterClasses = {}

# Converts one sound-change rule into a C++ initializer of the form
#   { wregex(L"<regex>"), L"<replacement>" },
#
# searchValue  - text to look for, or the name of a character class
# replaceValue - replacement text, or the name of a character class
#                (the caller's string is left unmodified)
# contextBegin - context required before the match; a leading "#" becomes
#                the regex anchor "^"
# contextEnd   - context required after the match; a trailing "#" becomes
#                the regex anchor "$"
#
# Parenthesized parts of the search value or contexts become optional
# non-capturing groups, and any character that names a character class is
# expanded to a bracket expression.
#
# Returns nil when search and replace values are identical. When both are
# character class names, returns one initializer per character pair (paired
# by position, up to the length of the shorter class), joined by spaces.
def formatRule(searchValue, replaceValue, contextBegin, contextEnd)
  return nil if replaceValue == searchValue

  # Special case: search and replace values are both character classes.
  # Expand into one rule per pair of corresponding characters.
  if $characterClasses.key?(searchValue) && $characterClasses.key?(replaceValue)
    searchCharacters = $characterClasses[searchValue]
    replaceCharacters = $characterClasses[replaceValue]
    count = [searchCharacters.length, replaceCharacters.length].min
    subrules = (0...count).map do |i|
      formatRule(searchCharacters[i], replaceCharacters[i], contextBegin, contextEnd)
    end
    # Pairs that map a character to itself yield nil and are skipped.
    return subrules.compact.reject(&:empty?).join(' ')
  end

  # Escape regex metacharacters, then turn "#" at the outer edges of the
  # contexts into anchors (Regexp.escape renders "#" as "\#").
  regexString = Regexp.escape(searchValue)
  contextBegin = Regexp.escape(contextBegin).sub(/^\\\#/, '^')
  contextEnd = Regexp.escape(contextEnd).sub(/\\\#$/, '$')

  # "$" introduces a group reference in the replacement format, so a literal
  # "$" is doubled. Non-mutating on purpose: the argument belongs to the caller.
  replacement = replaceValue.gsub(/\$/, '$$')

  hasContextBegin = !contextBegin.empty?
  hasContextEnd = !contextEnd.empty?

  # Contexts are captured and written back so that only searchValue changes.
  if hasContextBegin
    regexString = "(#{contextBegin})" + regexString
    replacement = '$1' + replacement
  end
  if hasContextEnd
    regexString = regexString + "(#{contextEnd})"
    replacement = replacement + (hasContextBegin ? '$2' : '$1')
  end

  # Optional parts: escaped parentheses become optional non-capturing groups.
  regexString = regexString.gsub(/\\\((.*?)\\\)/, '(?:\\1)?')

  # Fold repeated characters/classes, longest run first. This happens before
  # class expansion so that a repeated class name becomes "[...]{n}".
  5.downto(2) do |n|
    regexString = regexString.gsub(/(\w)\1{#{n - 1}}/) { "#{Regexp.last_match(1)}{#{n}}" }
  end

  # Character classes: replace each class name with a bracket expression.
  regexString = regexString.gsub(/./) do |ch|
    $characterClasses.key?(ch) ? "[#{$characterClasses[ch]}]" : ch
  end

  # C++ string escaping: prefix every backslash and double quote with a
  # backslash so the text survives inside an L"..." literal.
  escapeForCpp = ->(text) { text.gsub(/[\\"]/) { |ch| '\\' + ch } }

  "{ wregex(L\"#{escapeForCpp.call(regexString)}\"), L\"#{escapeForCpp.call(replacement)}\" },"
end
# Load the rule file (read as ISO-8859-1) and split it into lines,
# accepting both LF and CRLF line endings.
lines = File.read('../../lib/soundchange/english.sc', encoding: 'iso-8859-1').split(/\r?\n/)

# Append supplementary fallback rules after the rules from the file.
# Entries starting with "*" are comments.
lines.concat(
  [
    '* There are a number of cases not covered by these rules.',
    '* Let\'s add some reasonable fallback rules.',
    'a/â/_',
    'e/@/_',
    'i/ë/_',
    'o/ö/_',
    'q/k/_'
  ]
)
# Collect the leading "X=chars" character class definitions into
# $characterClasses. Comment lines ("*...") are skipped; scanning stops at the
# first line that is neither a comment nor a definition.
# characterClassLineCount ends up as the number of lines up to and including
# the last definition found (0 if there is none).
characterClassLineCount = 0
lines.each.with_index(1) do |entry, position|
  next if entry.start_with?('*')

  definition = entry.match(/^(.)=(.+)$/)
  break unless definition

  $characterClasses[definition[1]] = definition[2]
  characterClassLineCount = position
end
# Write g2pRules.cpp: every line after the character class definitions is
# either a comment ("*text", emitted as "//text") or a rule of the form
# "search/replace/before_after", emitted through formatRule.
# Any other line raises an error.
File.open('g2pRules.cpp', 'w:UTF-8') do |file|
  file.print "// Generated by #{__FILE__}; don't modify by hand!\n\n"

  lines.drop(characterClassLineCount).each do |line|
    if (comment = /^\*(.*)$/.match(line))
      # Comment line: carry it over as a C++ comment.
      file.puts "//#{comment[1]}"
    elsif (rule = /^(.+)\/(.*)\/(.*)_(.*)$/.match(line))
      # Rule line: the four captures are search, replace, and the context
      # before and after the "_" placeholder.
      target, substitute, before, after = rule.captures
      file.puts formatRule(target, substitute, before, after)
    else
      raise "Invalid rule: #{line}"
    end
  end
end