# This script reads transformation rules for English G2P (grapheme-to-phoneme
# conversion) from a text file and generates C++ code.
# The rule format is described here: http://www.zompist.com/sounds.htm

# Maps a single-character class name to the string of characters it stands for.
# Filled in by the "Parse character class definitions" step below and read by
# formatRule.
$characterClasses = {}

# Converts one sound-change rule into C++ source text.
#
# @param searchValue [String] text to search for, or the name of a character class
# @param replaceValue [String] replacement text, or the name of a character class
#   (not modified by this method)
# @param contextBegin [String] required context before the match; a leading '#'
#   anchors the rule at the start of the input
# @param contextEnd [String] required context after the match; a trailing '#'
#   anchors the rule at the end of the input
# @return [String, nil] one or more space-separated initializers of the form
#   `{ wregex(L"..."), L"..." },`, or nil if search and replace values are equal
def formatRule(searchValue, replaceValue, contextBegin, contextEnd)
  return nil if replaceValue == searchValue

  # Special case: search and replace values are character classes.
  # Pair the class members up by position and emit one sub-rule per pair.
  if $characterClasses.key?(searchValue) && $characterClasses.key?(replaceValue)
    searchCharacters = $characterClasses[searchValue]
    replaceCharacters = $characterClasses[replaceValue]
    count = [searchCharacters.length, replaceCharacters.length].min
    subrules = Array.new(count) do |i|
      formatRule(searchCharacters[i], replaceCharacters[i], contextBegin, contextEnd)
    end
    # Pairs whose members are identical yield nil and are dropped.
    return subrules.compact.join(' ')
  end

  # Special characters
  regexString = Regexp.escape(searchValue)
  contextBegin = Regexp.escape(contextBegin)
  contextEnd = Regexp.escape(contextEnd)
  # '$' introduces a group reference in the replacement format, so a literal
  # '$' has to be doubled. Non-destructive: the caller's string stays untouched.
  replaceValue = replaceValue.gsub('$', '$$')

  # Anchors: Regexp.escape has turned '#' into '\#' at this point.
  contextBegin = contextBegin.sub(/^\\\#/, '^')
  contextEnd = contextEnd.sub(/\\\#$/, '$')

  hasContextBegin = !contextBegin.empty?
  hasContextEnd = !contextEnd.empty?

  # Contexts are captured as groups and copied back into the replacement, so
  # that only the search value itself is substituted.
  if hasContextBegin
    regexString = "(#{contextBegin})#{regexString}"
    replaceValue = "$1#{replaceValue}"
  end
  if hasContextEnd
    regexString = "#{regexString}(#{contextEnd})"
    replaceValue = "#{replaceValue}#{hasContextBegin ? '$2' : '$1'}"
  end

  # Optional parts: an (escaped) parenthesized section becomes an optional
  # non-capturing group.
  regexString = regexString.gsub(/\\\((.*?)\\\)/, '(?:\\1)?')

  # Fold repeated characters/classes, longest run first
  regexString = regexString
    .gsub(/(\w)\1\1\1\1/, '\\1{5}')
    .gsub(/(\w)\1\1\1/, '\\1{4}')
    .gsub(/(\w)\1\1/, '\\1{3}')
    .gsub(/(\w)\1/, '\\1{2}')

  # Character classes: expand each class name into a bracket expression
  regexString = regexString.gsub(/./) do |ch|
    $characterClasses.key?(ch) ? "[#{$characterClasses[ch]}]" : ch
  end

  # C++ string escaping: prefix every backslash and double quote with a backslash
  regexString = regexString.gsub(/[\\"]/) { |ch| "\\#{ch}" }
  replaceValue = replaceValue.gsub(/[\\"]/) { |ch| "\\#{ch}" }

  "{ wregex(L\"#{regexString}\"), L\"#{replaceValue}\" },"
end

# Read rules
lines = File.read('../../lib/soundchange/english.sc', :encoding => 'iso-8859-1').split(/\r?\n/)

# Add supplementary rules
lines.push(
  '* There are a number of cases not covered by these rules.',
  "* Let's add some reasonable fallback rules.",
  'a/â/_',
  'e/@/_',
  'i/ë/_',
  'o/ö/_',
  'q/k/_'
)

# Parse character class definitions.
# They form the leading part of the file; the first line that is neither a
# comment nor a definition ends this section.
characterClassLineCount = 0
lines.each_with_index do |line, index|
  # Skip comments
  next if line.start_with?('*')

  match = /^(.)=(.+)$/.match(line)
  break unless match

  characterClassLineCount = index + 1
  $characterClasses[match[1]] = match[2]
end

# Parse rules and convert them to C++
File.open('g2pRules.cpp', 'w:UTF-8') do |file|
  file.print "// Generated by #{__FILE__}; don't modify by hand!\n\n"
  lines.drop(characterClassLineCount).each do |line|
    # Handle comments: pass them through as C++ comments
    comment = /^\*(.*)$/.match(line)
    if comment
      file.puts "//#{comment[1]}"
      next
    end

    # Handle rules of the form search/replace/contextBegin_contextEnd
    rule = %r{^(.+)/(.*)/(.*)_(.*)$}.match(line)
    raise "Invalid rule: #{line}" unless rule

    file.puts formatRule(rule[1], rule[2], rule[3], rule[4])
  end
end