// WebRTC voice activity detector (VAD), fixed-point implementation ported from C to Kotlin.
import kotlin.math.absoluteValue
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/signal_processing/include/signal_processing_library.h
|
|
|
|
// Macros specific for the fixed point implementation
|
|
// Maximum value of a signed 16-bit word (2^15 - 1), compile-time constant.
const val WEBRTC_SPL_WORD16_MAX = 32767
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/signal_processing/include/spl_inl.h
|
|
// webrtc/common_audio/signal_processing/spl_inl.c
|
|
|
|
// Table used by CountLeadingZeros32_NotBuiltin. For each UInt n
|
|
// that's a sequence of 0 bits followed by a sequence of 1 bits, the entry at
|
|
// index (n * 0x8c0b2891) shr 26 in this table gives the number of zero bits in
|
|
// n.
|
|
// 64-entry lookup table: the multiplicative hash above maps each of the 33
// valid "zeros-then-ones" masks to a distinct index; -1 entries are never
// produced by valid inputs (only 33 of the 64 hash values occur).
val kCountLeadingZeros32_Table = intArrayOf(
32, 8, 17, -1, -1, 14, -1, -1, -1, 20, -1, -1, -1, 28, -1, 18,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 26, 25, 24,
4, 11, 23, 31, 3, 7, 10, 16, 22, 30, -1, -1, 2, 6, 13, 9,
-1, 15, -1, 21, -1, 29, 19, -1, -1, -1, -1, -1, 1, 27, 5, 12
).apply { assert(size == 64) } // sanity check; note assert() is disabled by default on the JVM
|
|
|
|
// Returns the number of leading zero bits in the argument.
|
|
// Returns the number of leading zero bits in the argument (32 for 0u).
//
// Replaces the hand-rolled de Bruijn-style smear-multiply-lookup with the
// standard-library intrinsic, which compiles to a single hardware CLZ
// (Integer.numberOfLeadingZeros on the JVM) and is exactly equivalent.
fun CountLeadingZeros32(n: UInt): Int = n.countLeadingZeroBits()
|
|
|
|
// Return the number of steps a signed int can be left-shifted without overflow,
|
|
// or 0 if a == 0.
|
|
// Return the number of steps a signed int can be left-shifted without overflow,
// or 0 if a == 0 (mirrors WebRtcSpl_NormW32).
//
// Negative inputs are bit-inverted first so the count is taken on the
// two's-complement magnitude pattern; uses the stdlib CLZ intrinsic instead
// of the hand-rolled table lookup.
inline fun NormW32(a: Int): Int {
    if (a == 0) return 0
    val pattern = if (a < 0) a.inv() else a
    return pattern.countLeadingZeroBits() - 1
}
|
|
|
|
// Return the number of steps an unsigned int can be left-shifted without overflow,
|
|
// or 0 if a == 0.
|
|
// Return the number of steps an unsigned int can be left-shifted without
// overflow, or 0 if a == 0 (mirrors WebRtcSpl_NormU32).
// The zero check is required: countLeadingZeroBits(0u) is 32, not 0.
inline fun NormU32(a: UInt): Int = if (a == 0u) 0 else a.countLeadingZeroBits()
|
|
|
|
// Returns the number of bits needed to represent |n|, i.e. the position of
// the highest set bit plus one (0 for n == 0u).
inline fun GetSizeInBits(n: UInt): Int = 32 - n.countLeadingZeroBits()
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/signal_processing/get_scaling_square.c
|
|
|
|
//
|
|
// GetScalingSquare(...)
|
|
//
|
|
// Returns the # of bits required to scale the samples specified in the
|
|
// |in_vector| parameter so that, if the squares of the samples are added the
|
|
// # of times specified by the |times| parameter, the 32-bit addition will not
|
|
// overflow (result in Int).
|
|
//
|
|
// Input:
|
|
// - in_vector : Input vector to check scaling on
|
|
// - in_vector_length : Samples in |in_vector|
|
|
// - times : Number of additions to be performed
|
|
//
|
|
// Return value : Number of right bit shifts needed to avoid
|
|
// overflow in the addition calculation
|
|
//
// GetScalingSquare(...)
//
// Returns the number of right bit shifts needed on the samples in |buffer|
// so that adding the squared samples |times| times cannot overflow a 32-bit
// accumulator.
//
// - buffer : Input vector to check scaling on.
// - times  : Number of additions to be performed.
//
// Return value : Number of right bit shifts needed to avoid overflow.
fun GetScalingSquare(buffer: AudioBuffer, times: Int): Int {
    // Largest absolute sample value in the buffer.
    var peak = -1
    var i = 0
    while (i < buffer.size) {
        val magnitude = buffer[i].toInt().absoluteValue
        if (magnitude > peak) {
            peak = magnitude
        }
        i++
    }

    // All-zero input needs no scaling (norm(0) returns 0).
    if (peak == 0) {
        return 0
    }

    // Headroom left after squaring the peak, versus bits consumed by the
    // |times| additions.
    val headroom = NormW32(peak * peak)
    val additionBits = GetSizeInBits(times.toUInt())
    return if (headroom > additionBits) 0 else additionBits - headroom
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/signal_processing/energy.c
|
|
|
|
// Result of Energy(): the accumulated vector energy together with the scaling
// that was applied to keep the 32-bit accumulation from overflowing.
data class EnergyResult(
// Number of left bit shifts needed to get the physical energy value, i.e, to get the Q0 value
val scaleFactor: Int,

// Energy value in Q(-|scale_factor|)
val energy: Int
)
|
|
|
|
//
|
|
// Energy(...)
|
|
//
|
|
// Calculates the energy of a vector
|
|
//
|
|
// Input:
|
|
// - vector : Vector which the energy should be calculated on
|
|
// - vector_length : Number of samples in vector
|
|
//
|
|
// Output:
|
|
// - scale_factor : Number of left bit shifts needed to get the physical
|
|
// energy value, i.e, to get the Q0 value
|
|
//
|
|
// Return value : Energy value in Q(-|scale_factor|)
|
|
//
|
|
//
// Energy(...)
//
// Calculates the energy of a vector.
//
// - buffer : Vector which the energy should be calculated on.
//
// Returns the energy value in Q(-|scaleFactor|), together with |scaleFactor|,
// the number of left bit shifts needed to recover the physical (Q0) value.
fun Energy(buffer: AudioBuffer): EnergyResult {
    // Down-scaling that keeps the sum of squares within 32 bits.
    val shift = GetScalingSquare(buffer, buffer.size)

    var accumulated = 0
    var i = 0
    while (i < buffer.size) {
        accumulated += (buffer[i] * buffer[i]) shr shift
        i++
    }

    return EnergyResult(shift, accumulated)
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/signal_processing/division_operations.c
|
|
|
|
//
|
|
// DivW32W16(...)
|
|
//
|
|
// Divides a Int |num| by a Int |den|.
|
|
//
|
|
// If |den|==0, (Int)0x7FFFFFFF is returned.
|
|
//
|
|
// Input:
|
|
// - num : Numerator
|
|
// - den : Denominator
|
|
//
|
|
// Return value : Result of the division (as a Int), i.e., the
|
|
// integer part of num/den.
|
|
//
|
|
//
// DivW32W16(...)
//
// Divides |num| by |den| (integer part of num/den). If |den| == 0,
// Int.MAX_VALUE (0x7FFFFFFF) is returned, mirroring the C implementation.
fun DivW32W16(num: Int, den: Int): Int {
    if (den == 0) {
        return Int.MAX_VALUE
    }
    return num / den
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/vad/vad_gmm.c
|
|
|
|
// Result of GaussianProbability(): the (unnormalized) probability of the input
// under one Gaussian, plus the |delta| term reused for model updates.
data class GaussianProbabilityResult(
// (probability for |input|) = 1 / |std| * exp(-(|input| - |mean|)^2 / (2 * |std|^2))
val probability: Int,

// Input used when updating the model, Q11.
// |delta| = (|input| - |mean|) / |std|^2.
val delta: Int
)
|
|
|
|
// Upper bound on the Gaussian exponent (Q10) below which the probability is
// treated as non-zero in GaussianProbability().
const val kCompVar = 22005
// log2(exp(1)) in Q12.
const val kLog2Exp = 5909
|
|
|
|
// Calculates the probability for |input|, given that |input| comes from a
|
|
// normal distribution with mean and standard deviation (|mean|, |std|).
|
|
//
|
|
// Inputs:
|
|
// - input : input sample in Q4.
|
|
// - mean : mean input in the statistical model, Q7.
|
|
// - std : standard deviation, Q7.
|
|
//
|
|
// Output:
|
|
//
|
|
// - delta : input used when updating the model, Q11.
|
|
// |delta| = (|input| - |mean|) / |std|^2.
|
|
//
|
|
// Return:
|
|
// (probability for |input|) =
|
|
// 1 / |std| * exp(-(|input| - |mean|)^2 / (2 * |std|^2));
|
|
//---------------------------------------------------------------------------------
|
|
// For a normal distribution, the probability of |input| is calculated and
|
|
// returned (in Q20). The formula for normal distributed probability is
|
|
//
|
|
// 1 / s * exp(-(x - m)^2 / (2 * s^2))
|
|
//
|
|
// where the parameters are given in the following Q domains:
|
|
// m = |mean| (Q7)
|
|
// s = |std| (Q7)
|
|
// x = |input| (Q4)
|
|
// in addition to the probability we output |delta| (in Q11) used when updating
|
|
// the noise/speech model.
|
|
// Computes, in fixed point, the probability of |input| (Q4) under a normal
// distribution with mean |mean| (Q7) and standard deviation |std| (Q7):
//   probability (Q20) = 1/s * exp(-(x - m)^2 / (2 * s^2))
// and the update term delta (Q11) = (x - m) / s^2.
// See the long derivation comment above for the Q-domain bookkeeping.
fun GaussianProbability(input: Int, mean: Int, std: Int): GaussianProbabilityResult {
    var tmp16 = 0
    var inv_std = 0
    var inv_std2 = 0
    var exp_value = 0 // Stays 0 when the exponent is too large => probability 0.
    var tmp32 = 0

    // Calculate |inv_std| = 1 / s, in Q10.
    // 131072 = 1 in Q17, and (|std| shr 1) is for rounding instead of truncation.
    // Q-domain: Q17 / Q7 = Q10.
    tmp32 = 131072 + (std shr 1)
    inv_std = DivW32W16(tmp32, std)

    // Calculate |inv_std2| = 1 / s^2, in Q14.
    tmp16 = inv_std shr 2 // Q10 -> Q8.
    // Q-domain: (Q8 * Q8) shr 2 = Q14.
    inv_std2 = (tmp16 * tmp16) shr 2

    tmp16 = input shl 3 // Q4 -> Q7
    tmp16 -= mean // Q7 - Q7 = Q7

    // To be used later, when updating noise/speech model.
    // |delta| = (x - m) / s^2, in Q11.
    // Q-domain: (Q14 * Q7) shr 10 = Q11.
    val delta = (inv_std2 * tmp16) shr 10

    // Calculate the exponent |tmp32| = (x - m)^2 / (2 * s^2), in Q10. Replacing
    // division by two with one shift.
    // Q-domain: (Q11 * Q7) shr 8 = Q10.
    tmp32 = (delta * tmp16) shr 9

    // If the exponent is small enough to give a non-zero probability we calculate
    // |exp_value| ~= exp(-(x - m)^2 / (2 * s^2))
    //             ~= exp2(-log2(exp(1)) * |tmp32|).
    if (tmp32 < kCompVar) {
        // Calculate |tmp16| = log2(exp(1)) * |tmp32|, in Q10.
        // Q-domain: (Q12 * Q10) shr 12 = Q10.
        tmp16 = (kLog2Exp * tmp32) shr 12
        tmp16 = -tmp16
        // Mantissa of exp2: implicit leading one (0x0400 = 1.0 in Q10) plus the
        // fractional 10 bits of the (negated) exponent.
        exp_value = 0x0400 or (tmp16 and 0x03FF)
        // NOTE(review): the next three lines mimic the C int16 trick
        // (~t, >>10, +1) for deriving the exp2 shift count. On 32-bit Kotlin
        // Ints the intermediate differs from the C int16 value by a multiple
        // of 64, but the final result agrees because JVM shift counts are
        // taken mod 32 — confirm this if ever porting off the JVM.
        tmp16 = tmp16 xor 0xFFFF
        tmp16 = tmp16 shr 10
        tmp16 += 1;
        // Get |exp_value| = exp(-|tmp32|) in Q10.
        exp_value = exp_value shr tmp16
    }

    // Calculate and return (1 / s) * exp(-(x - m)^2 / (2 * s^2)), in Q20.
    // Q-domain: Q10 * Q10 = Q20.
    val probability = inv_std * exp_value
    return GaussianProbabilityResult(probability, delta)
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/vad/vad_core.c
|
|
|
|
// Spectrum Weighting
|
|
// BUG FIX: the size/count constants are declared BEFORE the tables that use
// them. Top-level properties are initialized in declaration order, so in the
// original ordering the table initializers referenced kNumChannels /
// kNumGaussians / kTableSize before they were initialized, making the
// `assert(size == ...)` checks meaningless (and a forward-reference hazard).

// Number of frequency bands (named channels)
val kNumChannels = 6

// Number of Gaussians per channel in the GMM
val kNumGaussians = 2

// Index = gaussian * kNumChannels + channel
val kTableSize = kNumChannels * kNumGaussians

// Minimum energy required to trigger audio signal
val kMinEnergy = 10

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
val kMaxSpeechFrames: Int = 6
// Minimum standard deviation for both speech and noise.
val kMinStd: Int = 384

// Spectrum Weighting
val kSpectrumWeight = intArrayOf(6, 8, 10, 12, 14, 16).apply { assert(size == kNumChannels) }
val kNoiseUpdateConst: Int = 655 // Q15
val kSpeechUpdateConst: Int = 6554 // Q15
val kBackEta: Int = 154 // Q8
// Minimum difference between the two models, Q5
val kMinimumDifference = intArrayOf(544, 544, 576, 576, 576, 576).apply { assert(size == kNumChannels) }
// Upper limit of mean value for speech model, Q7
val kMaximumSpeech = intArrayOf(11392, 11392, 11520, 11520, 11520, 11520).apply { assert(size == kNumChannels) }
// Minimum value for mean value
val kMinimumMean = intArrayOf(640, 768).apply { assert(size == kNumGaussians) }
// Upper limit of mean value for noise model, Q7
val kMaximumNoise = intArrayOf(9216, 9088, 8960, 8832, 8704, 8576).apply { assert(size == kNumChannels) }
// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
val kNoiseDataWeights = intArrayOf(34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103).apply { assert(size == kTableSize) }
// Weights for the two Gaussians for the six channels (speech)
val kSpeechDataWeights = intArrayOf(48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81).apply { assert(size == kTableSize) }
// Means for the two Gaussians for the six channels (noise)
val kNoiseDataMeans = intArrayOf(6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362).apply { assert(size == kTableSize) }
// Means for the two Gaussians for the six channels (speech)
val kSpeechDataMeans = intArrayOf(8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483).apply { assert(size == kTableSize) }
// Stds for the two Gaussians for the six channels (noise)
val kNoiseDataStds = intArrayOf(378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455).apply { assert(size == kTableSize) }
// Stds for the two Gaussians for the six channels (speech)
val kSpeechDataStds = intArrayOf(555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850).apply { assert(size == kTableSize) }
|
|
|
|
/**
 * The VAD operating modes in order of increasing aggressiveness.
 * A more aggressive VAD is more restrictive in reporting speech. Put in other words, the
 * probability of being speech when the VAD returns 1 is increased with increasing mode. As a
 * consequence, the missed detection rate also goes up.
 *
 * The ordinal order (0..3) matches the WebRTC C API's mode numbers.
 */
enum class Aggressiveness {
    Quality,        // Mode 0: least aggressive; fewest missed detections.
    LowBitrate,     // Mode 1.
    Aggressive,     // Mode 2.
    VeryAggressive  // Mode 3: most restrictive in reporting speech.
}
|
|
|
|
// State for one VAD instance, ported from the C struct VadInstT.
// The four model arrays are indexed by |gaussian * kNumChannels + channel|.
class VadInstT(aggressiveness: Aggressiveness = Aggressiveness.Quality) {
    // General variables
    var vad: Int = 1 // Speech active (=1)
    // TODO(bjornv): Change to |frame_count|.
    var frame_counter: Int = 0
    var over_hang: Int = 0      // Remaining hangover frames after speech.
    var num_of_speech: Int = 0  // Consecutive speech frames counted so far.

    // PDF parameters (working copies of the Q7 start values).
    var noise_means = kNoiseDataMeans.clone()
    // BUG FIX: was seeded from kSpeechDataStds (copy-paste slip) — the speech
    // means must start from kSpeechDataMeans.
    var speech_means = kSpeechDataMeans.clone()
    var noise_stds = kNoiseDataStds.clone()
    var speech_stds = kSpeechDataStds.clone()

    // Index vector: age (in frames) of each stored minimum, 16 per channel.
    // TODO(bjornv): Change to |age_vector|.
    var index_vector = IntArray(16 * kNumChannels) { 0 }

    // Minimum value vector: 16 smallest feature values per channel.
    var low_value_vector = IntArray(16 * kNumChannels) { 10000 }

    // Splitting filter states
    var upper_state = IntArray(5) { 0 }
    var lower_state = IntArray(5) { 0 }

    // High pass filter states
    var hp_filter_state = IntArray(4) { 0 }

    // Mean value memory for FindMinimum()
    // TODO(bjornv): Change to |median|.
    var mean_value = IntArray(kNumChannels) { 1600 }

    // Thresholds (fixed per instance, selected by |aggressiveness|).

    // Hangover granted after a short speech burst.
    val over_hang_max_1: Int = when (aggressiveness) {
        Aggressiveness.Quality, Aggressiveness.LowBitrate -> 8
        Aggressiveness.Aggressive, Aggressiveness.VeryAggressive -> 6
    }
    // Hangover granted after a sustained speech burst (>= kMaxSpeechFrames).
    val over_hang_max_2: Int = when (aggressiveness) {
        Aggressiveness.Quality, Aggressiveness.LowBitrate -> 14
        Aggressiveness.Aggressive, Aggressiveness.VeryAggressive -> 9
    }
    // Per-channel (local) likelihood-ratio threshold.
    // TODO: Rename to localThreshold?
    val individual: Int = when (aggressiveness) {
        Aggressiveness.Quality -> 24
        Aggressiveness.LowBitrate -> 37
        Aggressiveness.Aggressive -> 82
        Aggressiveness.VeryAggressive -> 94
    }
    // Global (spectrum-weighted sum) likelihood-ratio threshold.
    // TODO: Rename to globalThreshold?
    val total: Int = when (aggressiveness) {
        Aggressiveness.Quality -> 57
        Aggressiveness.LowBitrate -> 100
        Aggressiveness.Aggressive -> 285
        Aggressiveness.VeryAggressive -> 1100
    }
}
|
|
|
|
// Alias kept for parity with the C API's instance type name.
typealias WebRtcVadInst = VadInstT
|
|
|
|
// Calculates the weighted average w.r.t. number of Gaussians. The |data| are
|
|
// updated with an |offset| before averaging.
|
|
//
|
|
// - data [i/o] : Data to average.
|
|
// - offset [i] : An offset added to |data|.
|
|
// - weights [i] : Weights used for averaging.
|
|
//
|
|
// returns : The weighted average.
|
|
// Calculates the weighted average w.r.t. number of Gaussians. The |data| are
// updated with an |offset| before averaging.
//
// - data    [i/o] : Data to average; each of this channel's Gaussian entries
//                   is incremented by |offset| in place.
// - channel [i]   : Channel whose entries are combined.
// - offset  [i]   : An offset added to |data|.
// - weights [i]   : Weights used for averaging.
//
// returns         : The weighted average.
fun WeightedAverage(data: IntArray, channel: Int, offset: Int, weights: IntArray): Int {
    var accumulated = 0
    var index = channel
    repeat(kNumGaussians) {
        data[index] += offset
        accumulated += data[index] * weights[index]
        index += kNumChannels
    }
    return accumulated
}
|
|
|
|
// Calculates the probabilities for both speech and background noise using
|
|
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
|
|
// type of signal is most probable.
|
|
//
|
|
// - self [i/o] : Pointer to VAD instance
|
|
// - features [i] : Feature vector of length |kNumChannels|
|
|
// = log10(energy in frequency band)
|
|
// - total_power [i] : Total power in audio frame.
|
|
// - frame_length [i] : Number of input samples
|
|
//
|
|
// - returns : the VAD decision (0 - noise, 1 - speech).
|
|
// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
// type of signal is most probable.
//
// - self         [i/o] : VAD instance (model state, thresholds, hangover).
// - features     [i]   : Feature vector of length |kNumChannels|
//                        = log10(energy in frequency band)
// - total_power  [i]   : Total power in audio frame.
// - frame_length [i]   : Number of input samples (only 80 supported here).
//
// - returns            : the VAD decision (0 - noise, 1 - speech).
fun GmmProbability(self: VadInstT, features: IntArray, total_power: Int, frame_length: Int): Int {
    var vadflag = 0
    var tmp_s16: Int
    var tmp1_s16: Int
    var tmp2_s16: Int
    val deltaN = IntArray(kTableSize)
    val deltaS = IntArray(kTableSize)
    val ngprvec = IntArray(kTableSize) { 0 } // Conditional probability = 0.
    val sgprvec = IntArray(kTableSize) { 0 } // Conditional probability = 0.
    var sum_log_likelihood_ratios = 0
    val noise_probability = IntArray(kNumGaussians)
    val speech_probability = IntArray(kNumGaussians)

    assert(frame_length == 80)

    if (total_power > kMinEnergy) {
        // The signal power of current frame is large enough for processing. The
        // processing consists of two parts:
        // 1) Calculating the likelihood of speech and thereby a VAD decision.
        // 2) Updating the underlying model, w.r.t., the decision made.

        // The detection scheme is an LRT with hypothesis
        // H0: Noise
        // H1: Speech
        //
        // We combine a global LRT with local tests, for each frequency sub-band,
        // here defined as |channel|.
        for (channel in 0 until kNumChannels) {
            // For each channel we model the probability with a GMM consisting of
            // |kNumGaussians|, with different means and standard deviations depending
            // on H0 or H1.
            var h0_test = 0
            var h1_test = 0
            for (k in 0 until kNumGaussians) {
                val gaussian = channel + k * kNumChannels

                // Probability under H0, that is, probability of frame being noise.
                // Value given in Q27 = Q7 * Q20.
                val pNoise = GaussianProbability(features[channel], self.noise_means[gaussian], self.noise_stds[gaussian])
                deltaN[gaussian] = pNoise.delta
                noise_probability[k] = kNoiseDataWeights[gaussian] * pNoise.probability
                h0_test += noise_probability[k] // Q27

                // Probability under H1, that is, probability of frame being speech.
                // Value given in Q27 = Q7 * Q20.
                val pSpeech = GaussianProbability(features[channel], self.speech_means[gaussian], self.speech_stds[gaussian])
                speech_probability[k] = kSpeechDataWeights[gaussian] * pSpeech.probability
                deltaS[gaussian] = pSpeech.delta
                h1_test += speech_probability[k] // Q27
            }

            // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
            // Approximation:
            //   log2(h1_test) - log2(h0_test) ~= shifts_h0 - shifts_h1
            // since the fractional log2 terms are < 1 each and cancel on average.
            val shifts_h0 = if (h0_test != 0) NormW32(h0_test) else 31
            val shifts_h1 = if (h1_test != 0) NormW32(h1_test) else 31
            val log_likelihood_ratio = shifts_h0 - shifts_h1

            // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
            // used for the global VAD decision.
            sum_log_likelihood_ratios += log_likelihood_ratio * kSpectrumWeight[channel]

            // Local VAD decision.
            if ((log_likelihood_ratio * 4) > self.individual) {
                vadflag = 1
            }

            // TODO(bjornv): The conditional probabilities below are applied on the
            // hard coded number of Gaussians set to two. Find a way to generalize.
            // Calculate local noise probabilities used later when updating the GMM.
            val h0 = h0_test shr 12 // Q15
            if (h0 > 0) {
                // High probability of noise. Assign conditional probabilities for each
                // Gaussian in the GMM.
                val tmp = (noise_probability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
                ngprvec[channel] = DivW32W16(tmp, h0) // Q14
                ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel]
            } else {
                // Low noise probability. Assign conditional probability 1 to the first
                // Gaussian and 0 to the rest (which is already set at initialization).
                ngprvec[channel] = 16384
            }

            // Calculate local speech probabilities used later when updating the GMM.
            val h1 = h1_test shr 12 // Q15
            if (h1 > 0) {
                // High probability of speech. Assign conditional probabilities for each
                // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
                val tmp = (speech_probability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
                sgprvec[channel] = DivW32W16(tmp, h1) // Q14
                sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel]
            }
        }

        // Make a global VAD decision.
        vadflag = vadflag or (if (sum_log_likelihood_ratios >= self.total) 1 else 0)

        // Update the model parameters.
        var maxspe = 12800
        for (channel in 0 until kNumChannels) {
            // Get minimum value in past which is used for long term correction in Q4.
            val feature_minimum = FindMinimum(self, features[channel], channel)

            // Compute the "global" mean, that is the sum of the two means weighted.
            var noise_global_mean = WeightedAverage(self.noise_means, channel, 0, kNoiseDataWeights)
            tmp1_s16 = noise_global_mean shr 6 // Q8

            for (k in 0 until kNumGaussians) {
                val gaussian = channel + k * kNumChannels

                val nmk = self.noise_means[gaussian]
                val smk = self.speech_means[gaussian]
                var nsk = self.noise_stds[gaussian]
                var ssk = self.speech_stds[gaussian]

                // Update noise mean vector if the frame consists of noise only.
                var nmk2 = nmk
                if (vadflag == 0) {
                    // deltaN = (x-mu)/sigma^2
                    // ngprvec[k] = |noise_probability[k]| /
                    //   (|noise_probability[0]| + |noise_probability[1]|)

                    // (Q14 * Q11 shr 11) = Q14.
                    val delt = (ngprvec[gaussian] * deltaN[gaussian]) shr 11
                    // Q7 + (Q14 * Q15 shr 22) = Q7.
                    // BUG FIX: parentheses are required — infix `shr` binds looser
                    // than `+`, so the unparenthesized original computed
                    // `(nmk + delt * kNoiseUpdateConst) shr 22`.
                    nmk2 = nmk + ((delt * kNoiseUpdateConst) shr 22)
                }

                // Long term correction of the noise mean.
                // Q8 - Q8 = Q8.
                val ndelt = (feature_minimum shl 4) - tmp1_s16
                // Q7 + (Q8 * Q8) shr 9 = Q7.
                var nmk3 = nmk2 + ((ndelt * kBackEta) shr 9)

                // Control that the noise mean does not drift to much.
                tmp_s16 = (k + 5) shl 7
                if (nmk3 < tmp_s16) {
                    nmk3 = tmp_s16
                }
                tmp_s16 = (72 + k - channel) shl 7
                if (nmk3 > tmp_s16) {
                    nmk3 = tmp_s16
                }
                self.noise_means[gaussian] = nmk3

                if (vadflag != 0) {
                    // Update speech mean vector:
                    // |deltaS| = (x-mu)/sigma^2
                    // sgprvec[k] = |speech_probability[k]| /
                    //   (|speech_probability[0]| + |speech_probability[1]|)

                    // (Q14 * Q11) shr 11 = Q14.
                    val delt = (sgprvec[gaussian] * deltaS[gaussian]) shr 11
                    // Q14 * Q15 shr 21 = Q8.
                    tmp_s16 = (delt * kSpeechUpdateConst) shr 21
                    // Q7 + (Q8 shr 1) = Q7. With rounding.
                    var smk2 = smk + ((tmp_s16 + 1) shr 1)

                    // Control that the speech mean does not drift to much.
                    val maxmu = maxspe + 640
                    if (smk2 < kMinimumMean[k]) {
                        smk2 = kMinimumMean[k]
                    }
                    if (smk2 > maxmu) {
                        smk2 = maxmu
                    }
                    self.speech_means[gaussian] = smk2 // Q7.

                    // (Q7 shr 3) = Q4. With rounding.
                    tmp_s16 = (smk + 4) shr 3

                    tmp_s16 = features[channel] - tmp_s16 // Q4
                    // (Q11 * Q4 shr 3) = Q12.
                    var tmp1_s32 = (deltaS[gaussian] * tmp_s16) shr 3
                    var tmp2_s32 = tmp1_s32 - 4096
                    tmp_s16 = sgprvec[gaussian] shr 2
                    // (Q14 shr 2) * Q12 = Q24.
                    tmp1_s32 = tmp_s16 * tmp2_s32

                    tmp2_s32 = tmp1_s32 shr 4 // Q20

                    // 0.1 * Q20 / Q7 = Q13.
                    if (tmp2_s32 > 0) {
                        tmp_s16 = DivW32W16(tmp2_s32, ssk * 10)
                    } else {
                        tmp_s16 = DivW32W16(-tmp2_s32, ssk * 10)
                        tmp_s16 = -tmp_s16
                    }
                    // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
                    // Note that division by 4 equals shift by 2, hence,
                    // (Q13 shr 8) = (Q13 shr 6) / 4 = Q7.
                    tmp_s16 += 128 // Rounding.
                    ssk += (tmp_s16 shr 8)
                    if (ssk < kMinStd) {
                        ssk = kMinStd
                    }
                    self.speech_stds[gaussian] = ssk
                } else {
                    // Update GMM variance vectors.
                    // deltaN * (features[channel] - nmk) - 1
                    // Q4 - (Q7 shr 3) = Q4.
                    tmp_s16 = features[channel] - (nmk shr 3)
                    // (Q11 * Q4 shr 3) = Q12.
                    var tmp1_s32 = (deltaN[gaussian] * tmp_s16) shr 3
                    tmp1_s32 -= 4096

                    // (Q14 shr 2) * Q12 = Q24.
                    tmp_s16 = (ngprvec[gaussian] + 2) shr 2
                    val tmp2_s32 = tmp_s16 * tmp1_s32
                    // Q20 * approx 0.001 (2^-10=0.0009766), hence,
                    // (Q24 shr 14) = (Q24 shr 4) / 2^10 = Q20.
                    tmp1_s32 = tmp2_s32 shr 14

                    // Q20 / Q7 = Q13.
                    if (tmp1_s32 > 0) {
                        tmp_s16 = DivW32W16(tmp1_s32, nsk)
                    } else {
                        tmp_s16 = DivW32W16(-tmp1_s32, nsk)
                        tmp_s16 = -tmp_s16
                    }
                    tmp_s16 += 32 // Rounding
                    nsk += tmp_s16 shr 6 // Q13 shr 6 = Q7.
                    if (nsk < kMinStd) {
                        nsk = kMinStd
                    }
                    self.noise_stds[gaussian] = nsk
                }
            }

            // Separate models if they are too close.
            // |noise_global_mean| in Q14 (= Q7 * Q7).
            noise_global_mean = WeightedAverage(self.noise_means, channel, 0, kNoiseDataWeights)

            // |speech_global_mean| in Q14 (= Q7 * Q7).
            var speech_global_mean = WeightedAverage(self.speech_means, channel, 0, kSpeechDataWeights)

            // |diff| = "global" speech mean - "global" noise mean.
            // (Q14 shr 9) - (Q14 shr 9) = Q5.
            val diff = (speech_global_mean shr 9) - (noise_global_mean shr 9)
            if (diff < kMinimumDifference[channel]) {
                tmp_s16 = kMinimumDifference[channel] - diff

                // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
                // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
                tmp1_s16 = (13 * tmp_s16) shr 2
                tmp2_s16 = (3 * tmp_s16) shr 2

                // Move Gaussian means for speech model by |tmp1_s16| and update
                // |speech_global_mean|. Note that |self.speech_means[channel]| is
                // changed after the call.
                speech_global_mean = WeightedAverage(self.speech_means, channel, tmp1_s16, kSpeechDataWeights)

                // Move Gaussian means for noise model by -|tmp2_s16| and update
                // |noise_global_mean|. Note that |self.noise_means[channel]| is
                // changed after the call.
                noise_global_mean = WeightedAverage(self.noise_means, channel, -tmp2_s16, kNoiseDataWeights)
            }

            // Control that the speech & noise means do not drift to much.
            maxspe = kMaximumSpeech[channel]
            tmp2_s16 = speech_global_mean shr 7
            if (tmp2_s16 > maxspe) {
                // Upper limit of speech model.
                tmp2_s16 -= maxspe

                for (k in 0 until kNumGaussians) {
                    self.speech_means[channel + k * kNumChannels] -= tmp2_s16
                }
            }

            tmp2_s16 = noise_global_mean shr 7
            if (tmp2_s16 > kMaximumNoise[channel]) {
                tmp2_s16 -= kMaximumNoise[channel]

                for (k in 0 until kNumGaussians) {
                    self.noise_means[channel + k * kNumChannels] -= tmp2_s16
                }
            }
        }
        self.frame_counter++
    }

    // Smooth with respect to transition hysteresis.
    if (vadflag == 0) {
        if (self.over_hang > 0) {
            vadflag = 2 + self.over_hang
            self.over_hang--
        }
        self.num_of_speech = 0
    } else {
        self.num_of_speech++
        if (self.num_of_speech > kMaxSpeechFrames) {
            self.num_of_speech = kMaxSpeechFrames
            self.over_hang = self.over_hang_max_2
        } else {
            self.over_hang = self.over_hang_max_1
        }
    }
    return vadflag
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/vad/vad_sp.c
|
|
|
|
// Smoothing coefficients for the FindMinimum() median filter, Q15.
const val kSmoothingDown = 6553 // 0.2 in Q15.
const val kSmoothingUp = 32439 // 0.99 in Q15.
|
|
|
|
// Updates and returns the smoothed feature minimum. As minimum we use the
|
|
// median of the five smallest feature values in a 100 frames long window.
|
|
// As long as |handle->frame_counter| is zero, that is, we haven't received any
|
|
// "valid" data, FindMinimum() outputs the default value of 1600.
|
|
//
|
|
// Inputs:
|
|
// - feature_value : New feature value to update with.
|
|
// - channel : Channel number.
|
|
//
|
|
// Input & Output:
|
|
// - handle : State information of the VAD.
|
|
//
|
|
// Returns:
|
|
// : Smoothed minimum value for a moving window.
|
|
// Inserts |feature_value| into |low_value_vector|, if it is one of the 16
|
|
// smallest values the last 100 frames. Then calculates and returns the median
|
|
// of the five smallest values.
|
|
// Updates and returns the smoothed feature minimum. As minimum we use the
// median of the five smallest feature values in a 100 frames long window.
// As long as |self.frame_counter| is zero, that is, we haven't received any
// "valid" data, FindMinimum() outputs the default value of 1600.
//
// Inserts |feature_value| into |low_value_vector| if it is one of the 16
// smallest values seen over the last 100 frames, then computes and returns
// the smoothed median of the five smallest values.
//
// - self          [i/o] : State information of the VAD.
// - feature_value [i]   : New feature value to update with.
// - channel       [i]   : Channel number.
//
// Returns                : Smoothed minimum value for a moving window.
//
// PORT FIX: the previous body was untranslated C (raw pointers, C-style for
// loops, C casts) and did not compile as Kotlin. The expiry shift also read
// one slot past the 16-value window; since slot 15 is refilled immediately
// after the shift, stopping at index 14 is behaviorally identical and safe.
fun FindMinimum(self: VadInstT, feature_value: Int, channel: Int): Int {
    // Offset to beginning of this channel's 16 minimum values in memory.
    val offset = channel shl 4
    // Flat per-channel windows: 16 smallest values and the age of each value.
    val smallest_values = self.low_value_vector
    val age = self.index_vector

    assert(channel < kNumChannels)

    // Each value in |smallest_values| is getting 1 loop older. Update |age|, and
    // remove old values.
    for (i in 0 until 16) {
        if (age[offset + i] != 100) {
            age[offset + i]++
        } else {
            // Too old value. Remove from memory and shift larger values downwards.
            for (j in i until 15) {
                smallest_values[offset + j] = smallest_values[offset + j + 1]
                age[offset + j] = age[offset + j + 1]
            }
            age[offset + 15] = 101
            smallest_values[offset + 15] = 10000
        }
    }

    // Check if |feature_value| is smaller than any of the values in
    // |smallest_values|. If so, find the |position| where to insert it. The
    // window is kept sorted ascending, so this linear scan is equivalent to
    // the original hand-unrolled binary comparison tree.
    var position = -1
    if (feature_value < smallest_values[offset + 15]) {
        position = 0
        while (feature_value >= smallest_values[offset + position]) {
            position++
        }
    }

    // If we have detected a new small value, insert it at the correct position
    // and shift larger values up.
    if (position > -1) {
        for (i in 15 downTo position + 1) {
            smallest_values[offset + i] = smallest_values[offset + i - 1]
            age[offset + i] = age[offset + i - 1]
        }
        smallest_values[offset + position] = feature_value
        age[offset + position] = 1
    }

    // Get |current_median|: third smallest once enough frames have been seen.
    var current_median = 1600
    if (self.frame_counter > 2) {
        current_median = smallest_values[offset + 2]
    } else if (self.frame_counter > 0) {
        current_median = smallest_values[offset]
    }

    // Smooth the median value with an asymmetric exponential filter: fast
    // tracking downwards, slow upwards.
    var alpha = 0
    if (self.frame_counter > 0) {
        alpha = if (current_median < self.mean_value[channel]) {
            kSmoothingDown // 0.2 in Q15.
        } else {
            kSmoothingUp // 0.99 in Q15.
        }
    }
    var tmp32 = (alpha + 1) * self.mean_value[channel]
    tmp32 += (WEBRTC_SPL_WORD16_MAX - alpha) * current_median
    tmp32 += 16384 // Rounding before the Q15 shift.
    self.mean_value[channel] = tmp32 shr 15

    return self.mean_value[channel]
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/vad/vad_filterbank.c
|
|
|
|
// Constants used in LogOfEnergy().
|
|
// PORT FIX: these declarations were untranslated C (`val Int name[3] = {...}`)
// and did not compile as Kotlin.

// Constants used in LogOfEnergy().
const val kLogConst = 24660 // 160*log10(2) in Q9.
const val kLogEnergyIntPart = 14336 // 14 in Q10.

// Coefficients used by HighPassFilter, Q14.
val kHpZeroCoefs = intArrayOf(6631, -13262, 6631)
val kHpPoleCoefs = intArrayOf(16384, -7756, 5620)

// Allpass filter coefficients, upper and lower, in Q15.
// Upper: 0.64, Lower: 0.17.
val kAllPassCoefsQ15 = intArrayOf(20972, 5571)

// Adjustment for division with two in SplitFilter.
val kOffsetVector = intArrayOf(368, 368, 272, 176, 176, 176)
|
|
|
|
// High pass filtering, with a cut-off frequency at 80 Hz, if the |data_in| is
// sampled at 500 Hz.
//
// - data_in      [i]  : Input audio data sampled at 500 Hz.
// - data_length  [i]  : Length of input and output data.
// - filter_state [i/o]: State of the filter. Four delay taps are used:
//                       [0],[1] for the all-zero section, [2],[3] for the
//                       all-pole section.
// - data_out     [o]  : Output audio data in the frequency interval
//                       80 - 250 Hz.
void HighPassFilter(val Sample* data_in, Int data_length, Int* filter_state, Sample* data_out) {
  Int i;
  val Sample* in_ptr = data_in;
  Sample* out_ptr = data_out;
  Int tmp32 = 0;

  // The sum of the absolute values of the impulse response:
  // The zero/pole-filter has a max amplification of a single sample of: 1.4546
  // Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194
  // The all-zero section has a max amplification of a single sample of: 1.6189
  // Impulse response: 0.4047 -0.8094 0.4047 0 0 0
  // The all-pole section has a max amplification of a single sample of: 1.9931
  // Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532

  for (i = 0; i < data_length; i++) {
    // All-zero section (filter coefficients in Q14).
    tmp32 = kHpZeroCoefs[0] * *in_ptr;
    tmp32 += kHpZeroCoefs[1] * filter_state[0];
    tmp32 += kHpZeroCoefs[2] * filter_state[1];
    // Shift the zero-section delay line and consume the input sample.
    filter_state[1] = filter_state[0];
    filter_state[0] = *in_ptr++;

    // All-pole section (filter coefficients in Q14).
    tmp32 -= kHpPoleCoefs[1] * filter_state[2];
    tmp32 -= kHpPoleCoefs[2] * filter_state[3];
    // Shift the pole-section delay line; scale the accumulator back from Q14.
    filter_state[3] = filter_state[2];
    filter_state[2] = (Int)(tmp32 shr 14);
    *out_ptr++ = filter_state[2];
  }
}
|
|
|
|
// All pass filtering of |data_in|, used before splitting the signal into two
// frequency bands (low pass vs high pass).
// Note that |data_in| and |data_out| can NOT correspond to the same address.
//
// - data_in            [i]  : Input audio signal given in Q0. Read with a
//                             step of 2 (every other sample), so the caller
//                             passes interleaved even/odd branches.
// - data_length        [i]  : Length of input and output data.
// - filter_coefficient [i]  : Given in Q15.
// - filter_state       [i/o]: State of the filter given in Q(-1).
// - data_out           [o]  : Output audio signal given in Q(-1).
void AllPassFilter(val Sample* data_in, Int data_length,
                   Int filter_coefficient, Int* filter_state,
                   Sample* data_out) {
  // The filter can only cause overflow (in the w16 output variable)
  // if more than 4 consecutive input numbers are of maximum value and
  // has the same sign as the impulse responses first taps.
  // First 6 taps of the impulse response:
  // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990

  Int i;
  Int tmp16 = 0;
  Int tmp32 = 0;
  Int state32 = ((Int)(*filter_state) * (1 shl 16)); // Q15

  for (i = 0; i < data_length; i++) {
    tmp32 = state32 + filter_coefficient * *data_in;
    tmp16 = (Int)(tmp32 shr 16); // Q(-1)
    *data_out++ = tmp16;
    state32 = (*data_in * (1 shl 14)) - filter_coefficient * tmp16; // Q14
    state32 *= 2; // Q15.
    data_in += 2; // Advance by 2: this branch processes every other sample.
  }

  // Save the final state for the next call, back in Q(-1).
  *filter_state = (Int)(state32 shr 16); // Q(-1)
}
|
|
|
|
// Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
// an upper (high pass) part and a lower (low pass) part respectively.
//
// - data_in     [i]  : Input audio data to be split into two frequency bands.
// - data_length [i]  : Length of |data_in|.
// - upper_state [i/o]: State of the upper filter, given in Q(-1).
// - lower_state [i/o]: State of the lower filter, given in Q(-1).
// - hp_data_out [o]  : Output audio data of the upper half of the spectrum.
//                      The length is |data_length| / 2.
// - lp_data_out [o]  : Output audio data of the lower half of the spectrum.
//                      The length is |data_length| / 2.
void SplitFilter(val Sample* data_in, Int data_length,
                 Int* upper_state, Int* lower_state,
                 Sample* hp_data_out, Sample* lp_data_out) {
  Int i;
  Int half_length = data_length shr 1; // Downsampling by 2.
  Int tmp_out;

  // All-pass filtering upper branch (even-indexed samples of |data_in|).
  AllPassFilter(&data_in[0], half_length, kAllPassCoefsQ15[0], upper_state, hp_data_out);

  // All-pass filtering lower branch (odd-indexed samples of |data_in|).
  AllPassFilter(&data_in[1], half_length, kAllPassCoefsQ15[1], lower_state, lp_data_out);

  // Make LP and HP signals: the sum of the two branches yields the low band,
  // the difference yields the high band.
  for (i = 0; i < half_length; i++) {
    tmp_out = *hp_data_out;
    *hp_data_out++ -= *lp_data_out;
    *lp_data_out++ += tmp_out;
  }
}
|
|
|
|
// Calculates the energy of |data_in| in dB, and also updates an overall
// |total_energy| if necessary.
//
// - data_in      [i]  : Input audio data for energy calculation.
// - data_length  [i]  : Length of input data.
// - offset       [i]  : Offset value added to |log_energy|.
// - total_energy [i/o]: An external energy updated with the energy of
//                       |data_in|.
//                       NOTE: |total_energy| is only updated if
//                       |total_energy| <= |kMinEnergy|.
// - log_energy   [o]  : 10 * log10("energy of |data_in|") given in Q4.
void LogOfEnergy(val Sample* data_in, Int data_length,
                 Int offset, Int* total_energy,
                 Int* log_energy) {
  // |tot_rshifts| accumulates the number of right shifts performed on |energy|.
  Int tot_rshifts = 0;
  // The |energy| will be normalized to 15 bits. We use unsigned integer because
  // we eventually will mask out the fractional part.
  UInt energy = 0;

  assert(data_in != NULL);
  assert(data_length > 0);

  // NOTE(review): Energy() is defined elsewhere; it appears to return the
  // signal energy scaled down by |tot_rshifts| right shifts — confirm against
  // its definition.
  energy = (UInt)Energy((Int*)data_in, data_length,
                        &tot_rshifts);

  if (energy != 0) {
    // By construction, normalizing to 15 bits is equivalent with 17 leading
    // zeros of an unsigned 32 bit value.
    Int normalizing_rshifts = 17 - NormU32(energy);
    // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
    // (14 shl 10), which is what we initialize |log2_energy| with. For a more
    // detailed derivation, see below.
    Int log2_energy = kLogEnergyIntPart;

    tot_rshifts += normalizing_rshifts;
    // Normalize |energy| to 15 bits.
    // |tot_rshifts| is now the total number of right shifts performed on
    // |energy| after normalization. This means that |energy| is in
    // Q(-tot_rshifts).
    if (normalizing_rshifts < 0) {
      energy shl= -normalizing_rshifts;
    } else {
      energy shr= normalizing_rshifts;
    }

    // Calculate the energy of |data_in| in dB, in Q4.
    //
    // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
    //   160 * log10(|energy| * 2^|tot_rshifts|) =
    //   160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
    //   160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
    //   (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
    //   |kLogConst| * (|log2_energy| + |tot_rshifts|)
    //
    // We know by construction that |energy| is normalized to 15 bits. Hence,
    // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
    // Further, we'd like |log2_energy| in Q10
    // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
    //   2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
    //   2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
    //   (14 shl 10) + 2^10 * (frac_Q15 * 2^-14) =
    //   (14 shl 10) + (frac_Q15 * 2^-4) = (14 shl 10) + (frac_Q15 shr 4)
    //
    // Note that frac_Q15 = (|energy| & 0x00003FFF)

    // Calculate and add the fractional part to |log2_energy|.
    log2_energy += (Int)((energy & 0x00003FFF) shr 4);

    // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
    // Note that we in our derivation above have accounted for an output in Q4.
    *log_energy = (Int)(((kLogConst * log2_energy) shr 19) +
                        ((tot_rshifts * kLogConst) shr 9));

    // Clamp to zero: the log energy is reported as a non-negative Q4 value.
    if (*log_energy < 0) {
      *log_energy = 0;
    }
  } else {
    // Zero energy: report only the band offset, and skip the |total_energy|
    // update below.
    *log_energy = offset;
    return;
  }

  *log_energy += offset;

  // Update the approximate |total_energy| with the energy of |data_in|, if
  // |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as an
  // energy indicator in GmmProbability() in vad_core.c.
  if (*total_energy <= kMinEnergy) {
    if (tot_rshifts >= 0) {
      // We know by construction that the |energy| > |kMinEnergy| in Q0, so add
      // an arbitrary value such that |total_energy| exceeds |kMinEnergy|.
      *total_energy += kMinEnergy + 1;
    } else {
      // By construction |energy| is represented by 15 bits, hence any number of
      // right shifted |energy| will fit in an Int. In addition, adding the
      // value to |total_energy| is wrap around safe as long as
      // |kMinEnergy| < 8192.
      *total_energy += (Int)(energy shr -tot_rshifts); // Q0.
    }
  }
}
|
|
|
|
// Takes |data_length| samples of |data_in| and calculates the logarithm of the
// energy of each of the |kNumChannels| = 6 frequency bands used by the VAD:
//        80 Hz - 250 Hz
//        250 Hz - 500 Hz
//        500 Hz - 1000 Hz
//        1000 Hz - 2000 Hz
//        2000 Hz - 3000 Hz
//        3000 Hz - 4000 Hz
//
// The values are given in Q4 and written to |features|. Further, an approximate
// overall energy is returned. The return value is used in
// GmmProbability() as a signal indicator, hence it is arbitrary above
// the threshold |kMinEnergy|.
//
// - self        [i/o]: State information of the VAD.
// - data_in     [i]  : Input audio data, for feature extraction.
// - data_length [i]  : Audio data size, in number of samples.
// - features    [o]  : 10 * log10(energy in each frequency band), Q4.
// - returns          : Total energy of the signal (NOTE! This value is not
//                      exact. It is only used in a comparison.)
Int CalculateFeatures(VadInstT* self, val Sample* data_in, Int data_length, Int* features) {
  Int total_energy = 0;
  // We expect |data_length| to be 80, 160 or 240 samples, which corresponds to
  // 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled data will
  // have at most 120 samples after the first split and at most 60 samples after
  // the second split. The hp_/lp_ buffers are deliberately reused between
  // stages once their previous contents have been consumed.
  Sample hp_120[120], lp_120[120];
  Sample hp_60[60], lp_60[60];
  val Int half_data_length = data_length shr 1;
  Int length = half_data_length; // |data_length| / 2, corresponds to
                                 // bandwidth = 2000 Hz after downsampling.

  // Initialize variables for the first SplitFilter().
  Int frequency_band = 0;
  val Sample* in_ptr = data_in;  // [0 - 4000] Hz.
  Sample* hp_out_ptr = hp_120;   // [2000 - 4000] Hz.
  Sample* lp_out_ptr = lp_120;   // [0 - 2000] Hz.

  assert(data_length <= 240);
  assert(4 < kNumChannels - 1); // Checking maximum |frequency_band|.

  // Split at 2000 Hz and downsample.
  SplitFilter(in_ptr, data_length, &self.upper_state[frequency_band],
              &self.lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample.
  frequency_band = 1;
  in_ptr = hp_120;    // [2000 - 4000] Hz.
  hp_out_ptr = hp_60; // [3000 - 4000] Hz.
  lp_out_ptr = lp_60; // [2000 - 3000] Hz.
  SplitFilter(in_ptr, length, &self.upper_state[frequency_band],
              &self.lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // Energy in 3000 Hz - 4000 Hz.
  length shr= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.

  LogOfEnergy(hp_60, length, kOffsetVector[5], &total_energy, &features[5]);

  // Energy in 2000 Hz - 3000 Hz.
  LogOfEnergy(lp_60, length, kOffsetVector[4], &total_energy, &features[4]);

  // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample.
  frequency_band = 2;
  in_ptr = lp_120;    // [0 - 2000] Hz.
  hp_out_ptr = hp_60; // [1000 - 2000] Hz.
  lp_out_ptr = lp_60; // [0 - 1000] Hz.
  length = half_data_length; // |data_length| / 2 <=> bandwidth = 2000 Hz.
  SplitFilter(in_ptr, length, &self.upper_state[frequency_band],
              &self.lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // Energy in 1000 Hz - 2000 Hz.
  length shr= 1; // |data_length| / 4 <=> bandwidth = 1000 Hz.
  LogOfEnergy(hp_60, length, kOffsetVector[3], &total_energy, &features[3]);

  // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample.
  frequency_band = 3;
  in_ptr = lp_60;      // [0 - 1000] Hz.
  hp_out_ptr = hp_120; // [500 - 1000] Hz.
  lp_out_ptr = lp_120; // [0 - 500] Hz.
  SplitFilter(in_ptr, length, &self.upper_state[frequency_band],
              &self.lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // Energy in 500 Hz - 1000 Hz.
  length shr= 1; // |data_length| / 8 <=> bandwidth = 500 Hz.
  LogOfEnergy(hp_120, length, kOffsetVector[2], &total_energy, &features[2]);

  // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample.
  frequency_band = 4;
  in_ptr = lp_120;    // [0 - 500] Hz.
  hp_out_ptr = hp_60; // [250 - 500] Hz.
  lp_out_ptr = lp_60; // [0 - 250] Hz.
  SplitFilter(in_ptr, length, &self.upper_state[frequency_band],
              &self.lower_state[frequency_band], hp_out_ptr, lp_out_ptr);

  // Energy in 250 Hz - 500 Hz.
  length shr= 1; // |data_length| / 16 <=> bandwidth = 250 Hz.
  LogOfEnergy(hp_60, length, kOffsetVector[1], &total_energy, &features[1]);

  // Remove 0 Hz - 80 Hz, by high pass filtering the lower band.
  HighPassFilter(lp_60, length, self.hp_filter_state, hp_120);

  // Energy in 80 Hz - 250 Hz.
  LogOfEnergy(hp_120, length, kOffsetVector[0], &total_energy, &features[0]);

  return total_energy;
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// webrtc/common_audio/vad/webrtc_vad.c
|
|
|
|
// This function was moved from vad_core.c.
/****************************************************************************
 * CalcVad48khz(...)
 * CalcVad32khz(...)
 * CalcVad16khz(...)
 * CalcVad8khz(...)
 *
 * Calculate probability for active speech and make VAD decision.
 *
 * Input:
 *      - inst          : Instance that should be initialized
 *      - speech_frame  : Input speech frame
 *      - frame_length  : Number of input samples
 *
 * Output:
 *      - inst          : Updated filter states etc.
 *
 * Return value         : VAD decision
 *                        0 - No active speech
 *                        1-6 - Active speech
 */
Int CalcVad8khz(VadInstT* inst, val Sample* speech_frame, Int frame_length) {
  Int feature_vector[kNumChannels], total_power;

  // Get power in the bands.
  total_power = CalculateFeatures(inst, speech_frame, frame_length,
                                  feature_vector);

  // Make a VAD decision and cache it on the instance.
  // Note: member access uses '.' for consistency with the rest of this port
  // (e.g. |self.init_flag|, |self.upper_state|); the original used '->'.
  inst.vad = GmmProbability(inst, feature_vector, total_power, frame_length);

  return inst.vad;
}
|
|
|
|
// Calculates a VAD decision for the |audio_frame|. For valid sampling rates
// frame lengths, see the description of ValidRatesAndFrameLengths().
//
// - handle       [i/o]: VAD Instance. Needs to be initialized by
//                       InitVadInst() before call.
// - fs           [i]  : Sampling frequency (Hz): 8000, 16000, or 32000
// - audio_frame  [i]  : Audio frame buffer.
// - frame_length [i]  : Length of audio frame buffer in number of samples.
//
// returns             : 1 - (Active Voice),
//                       0 - (Non-active Voice),
//                      -1 - (Error)
Int ProcessVad(VadInst* handle, Int fs, val Sample* audio_frame, Int frame_length) {
  // Guard against a missing instance before touching any of its state.
  if (handle == NULL) {
    return -1;
  }

  VadInstT* self = (VadInstT*)handle;

  // Reject an uninitialized instance, a missing buffer, or an unsupported
  // sampling rate (only 8 kHz is handled by this port).
  if (self.init_flag != kInitCheck || audio_frame == NULL || fs != 8000) {
    return -1;
  }

  // Any positive GMM decision (1-6) is collapsed to a single "active" flag.
  Int decision = CalcVad8khz(self, audio_frame, frame_length);
  if (decision > 0) {
    decision = 1;
  }
  return decision;
}
|