Simplify VoiceActivityDetector state

This commit is contained in:
Daniel Wolf 2019-10-04 18:02:02 +02:00
parent 273ba16bf8
commit 75876ef4a8
1 changed files with 14 additions and 35 deletions

View File

@ -185,8 +185,6 @@ enum class Aggressiveness {
class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Quality) { class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Quality) {
private var frameCount: Int = 0 private var frameCount: Int = 0
private var overhang: Int = 0
private var speechFrameCount: Int = 0
// PDF parameters // PDF parameters
private val noiseMeans = NOISE_DATA_MEANS.clone() private val noiseMeans = NOISE_DATA_MEANS.clone()
@ -208,17 +206,15 @@ class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Qual
private val median = IntArray(CHANNEL_COUNT) { 1600 } private val median = IntArray(CHANNEL_COUNT) { 1600 }
private data class ThresholdsRecord( private data class ThresholdsRecord(
val overhangMax1: Int,
val overhangMax2: Int,
val localThreshold: Int, val localThreshold: Int,
val globalThreshold: Int val globalThreshold: Int
) )
private val thresholds = when(aggressiveness) { private val thresholds = when(aggressiveness) {
Aggressiveness.Quality -> ThresholdsRecord(8, 14, 24, 57) Aggressiveness.Quality -> ThresholdsRecord(24, 57)
Aggressiveness.LowBitrate -> ThresholdsRecord(8, 14, 37, 100) Aggressiveness.LowBitrate -> ThresholdsRecord(37, 100)
Aggressiveness.Aggressive -> ThresholdsRecord(6, 9, 82, 285) Aggressiveness.Aggressive -> ThresholdsRecord(82, 285)
Aggressiveness.VeryAggressive -> ThresholdsRecord(6, 9, 94, 1100) Aggressiveness.VeryAggressive -> ThresholdsRecord(94, 1100)
} }
/** /**
@ -228,10 +224,10 @@ class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Qual
* @param[features] Feature vector of length [CHANNEL_COUNT] = log10(energy in frequency band) * @param[features] Feature vector of length [CHANNEL_COUNT] = log10(energy in frequency band)
* @param[totalEnergy] Total energy in audio frame. * @param[totalEnergy] Total energy in audio frame.
* @param[frameLength] Number of input samples. * @param[frameLength] Number of input samples.
* @return VAD decision. 0: no active speech; 1-6: active speech * @return VAD decision. True if frame contains speech, false otherwise.
*/ */
private fun getGmmProbability(features: List<Int>, totalEnergy: Int, frameLength: Int): Int { private fun getGmmProbability(features: List<Int>, totalEnergy: Int, frameLength: Int): Boolean {
var vadFlag = 0 var speech = false
var tmp: Int var tmp: Int
var tmp1: Int var tmp1: Int
var tmp2: Int var tmp2: Int
@ -303,7 +299,7 @@ class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Qual
// Local VAD decision. // Local VAD decision.
if ((logLikelihoodRatio * 4) > thresholds.localThreshold) { if ((logLikelihoodRatio * 4) > thresholds.localThreshold) {
vadFlag = 1 speech = true
} }
// Calculate local noise probabilities used later when updating the GMM. // Calculate local noise probabilities used later when updating the GMM.
@ -332,7 +328,7 @@ class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Qual
} }
// Make a global VAD decision. // Make a global VAD decision.
vadFlag = vadFlag or (if (sumLogLikelihoodRatios >= thresholds.globalThreshold) 1 else 0) speech = speech || (sumLogLikelihoodRatios >= thresholds.globalThreshold)
// Update the model parameters. // Update the model parameters.
var maxspe = 12800 var maxspe = 12800
@ -354,7 +350,7 @@ class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Qual
// Update noise mean vector if the frame consists of noise only. // Update noise mean vector if the frame consists of noise only.
var nmk2 = nmk var nmk2 = nmk
if (vadFlag == 0) { if (!speech) {
// deltaN = (x-mu)/sigma^2 // deltaN = (x-mu)/sigma^2
// ngprvec[k] = noiseProbability[k] / (noiseProbability[0] + noiseProbability[1]) // ngprvec[k] = noiseProbability[k] / (noiseProbability[0] + noiseProbability[1])
@ -381,7 +377,7 @@ class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Qual
} }
noiseMeans[gaussian] = nmk3 noiseMeans[gaussian] = nmk3
if (vadFlag != 0) { if (speech) {
// Update speech mean vector: // Update speech mean vector:
// deltaS = (x-mu)/sigma^2 // deltaS = (x-mu)/sigma^2
// sgprvec[k] = speechProbability[k] / (speechProbability[0] + speechProbability[1]) // sgprvec[k] = speechProbability[k] / (speechProbability[0] + speechProbability[1])
@ -516,23 +512,7 @@ class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Qual
frameCount++ frameCount++
} }
// Smooth with respect to transition hysteresis. return speech
if (vadFlag == 0) {
if (overhang > 0) {
vadFlag = 2 + overhang
overhang--
}
speechFrameCount = 0
} else {
speechFrameCount++
if (speechFrameCount > MAX_SPEECH_FRAMES) {
speechFrameCount = MAX_SPEECH_FRAMES
overhang = thresholds.overhangMax2
} else {
overhang = thresholds.overhangMax1
}
}
return vadFlag
} }
/** /**
@ -726,9 +706,8 @@ class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Qual
val (features, totalEnergy) = calculateFeatures(audioFrame) val (features, totalEnergy) = calculateFeatures(audioFrame)
// Make a VAD // Make a VAD
val vadResult = getGmmProbability(features, totalEnergy, audioFrame.size) val speech = getGmmProbability(features, totalEnergy, audioFrame.size)
// return vad != 0 return speech
return vadResult == 1
} }
} }