From 273ba16bf835a67f389f6b641ce374afb5dbf56c Mon Sep 17 00:00:00 2001
From: Daniel Wolf <dwolf@dannad.de>
Date: Fri, 4 Oct 2019 17:41:53 +0200
Subject: [PATCH] Transform to idiomatic Kotlin

---
 rhubarb/src/main/kotlin/vad.kt                | 1139 -----------------
 .../SignalProcessing.kt                       |  298 +++++
 .../VoiceActivityDetector.kt                  |  734 +++++++++++
 3 files changed, 1032 insertions(+), 1139 deletions(-)
 delete mode 100644 rhubarb/src/main/kotlin/vad.kt
 create mode 100644 rhubarb/src/main/kotlin/voice_activity_detection/SignalProcessing.kt
 create mode 100644 rhubarb/src/main/kotlin/voice_activity_detection/VoiceActivityDetector.kt

diff --git a/rhubarb/src/main/kotlin/vad.kt b/rhubarb/src/main/kotlin/vad.kt
deleted file mode 100644
index 60eef60..0000000
--- a/rhubarb/src/main/kotlin/vad.kt
+++ /dev/null
@@ -1,1139 +0,0 @@
-import org.apache.commons.lang3.mutable.MutableInt
-import kotlin.math.absoluteValue
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// webrtc/common_audio/signal_processing/include/spl_inl.h
-// webrtc/common_audio/signal_processing/spl_inl.c
-
-/**
- * Table used by getLeadingZeroCount.
- * For each UInt n that's a sequence of 0 bits followed by a sequence of 1 bits, the entry at index
- * (n * 0x8c0b2891) shr 26 in this table gives the number of zero bits in n.
- */
-val leadingZerosTable = intArrayOf(
-	32, 8,  17, -1, -1, 14, -1, -1, -1, 20, -1, -1, -1, 28, -1, 18,
-	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  26, 25, 24,
-	4,  11, 23, 31, 3,  7,  10, 16, 22, 30, -1, -1, 2,  6,  13, 9,
-	-1, 15, -1, 21, -1, 29, 19, -1, -1, -1, -1, -1, 1,  27, 5,  12
-).apply { assert(size == 64) }
-
-/** Returns the number of leading zero bits in the argument. */
-fun getLeadingZeroCount(n: UInt): Int {
-	// Normalize n by rounding up to the nearest number that is a sequence of 0 bits followed by a
-	// sequence of 1 bits. This number has the same number of leading zeros as the original n.
-	// There are exactly 33 such values.
-	var normalized = n
-	normalized = normalized or (normalized shr 1)
-	normalized = normalized or (normalized shr 2)
-	normalized = normalized or (normalized shr 4)
-	normalized = normalized or (normalized shr 8)
-	normalized = normalized or (normalized shr 16)
-
-	// Multiply the modified n with a constant selected (by exhaustive search) such that each of the
-	// 33 possible values of n give a product whose 6 most significant bits are unique.
-	// Then look up the answer in the table.
-	return leadingZerosTable[((normalized * 0x8c0b2891u) shr 26).toInt()]
-}
-
-/**
- * Returns the number of bits by which a signed int can be left-shifted without overflow, or 0 if
- * a == 0.
- */
-fun normSigned(a: Int): Int =
-	if (a == 0)
-		0
-	else
-		getLeadingZeroCount((if (a < 0) a.inv() else a).toUInt()) - 1
-
-/**
- * Returns the number of bits by which an unsigned int can be left-shifted without overflow, or 0 if
- * a == 0.
- */
-fun normUnsigned(a: UInt): Int = if (a == 0u) 0 else getLeadingZeroCount(a)
-
-/** Returns the number of bits needed to represent the specified value. */
-fun getBitCount(n: UInt): Int = 32 - getLeadingZeroCount(n)
-
-/**
- * Returns the number of right bit shifts that must be applied to each of the given samples so that,
- * if the squares of the samples are added [times] times, the signed 32-bit addition will not
- * overflow.
-*/
-fun getScalingSquare(buffer: AudioBuffer, times: Int): Int {
-	var maxAbsSample = -1
-	for (i in 0 until buffer.size) {
-		val absSample = buffer[i].toInt().absoluteValue
-		if (absSample > maxAbsSample) {
-			maxAbsSample = absSample
-		}
-	}
-
-	if (maxAbsSample == 0) {
-		return 0 // Since norm(0) returns 0
-	}
-
-	val t = normSigned(maxAbsSample * maxAbsSample)
-	val bitCount = getBitCount(times.toUInt())
-	return if (t > bitCount) 0 else bitCount - t
-}
-
-data class EnergyResult(
-	/**
-	 * The number of left bit shifts needed to get the physical energy value, i.e, to get the Q0
-	 * value
-	 */
-	val rightShifts: Int,
-
-	/** The energy value in Q(-[scale_factor]) */
-	val energy: Int
-)
-
-/** Calculates the energy of an audio buffer. */
-fun getEnergy(buffer: AudioBuffer): EnergyResult {
-	val scaling = getScalingSquare(buffer, buffer.size)
-
-	var energy = 0
-	for (i in 0 until buffer.size) {
-		energy += (buffer[i] * buffer[i]) shr scaling
-	}
-
-	return EnergyResult(scaling, energy)
-}
-
-/** Performs a safe integer division, returning [Int.MAX_VALUE] if [denominator] = 0. */
-infix fun Int.safeDiv(denominator: Int) = if (denominator != 0) this / denominator else Int.MAX_VALUE
-
-data class GaussianProbabilityResult(
-	/** (probability for input) = 1 / std * exp(-(input - mean)^2 / (2 * std^2)) */
-	val probability: Int,
-
-	/**
-	 * Input used when updating the model, Q11.
-	 * delta = (input - mean) / std^2.
-	 */
-	val delta: Int
-)
-
-/**
- * Calculates the probability for [input], given that [input] comes from a normal distribution with
- * mean [mean] and standard deviation [std].
- *
- * @param [input] Input sample in Q4.
- * @param [mean] Mean input in the statistical model, Q7.
- * @param [std] Standard deviation, Q7.
-*/
-fun getGaussianProbability(input: Int, mean: Int, std: Int): GaussianProbabilityResult {
-	var tmp16 = 0
-	var invStd = 0
-	var invStd2 = 0
-	var expValue = 0
-	var tmp32 = 0
-
-	// Calculate invStd = 1 / s, in Q10.
-	// 131072 = 1 in Q17, and (std shr 1) is for rounding instead of truncation.
-	// Q-domain: Q17 / Q7 = Q10
-	tmp32 = 131072 + (std shr 1)
-	invStd = tmp32 safeDiv std
-
-	// Calculate inv_std2 = 1 / s^2, in Q14
-	tmp16 = invStd shr 2 // Q10 -> Q8.
-	// Q-domain: (Q8 * Q8) shr 2 = Q14
-	invStd2 = (tmp16 * tmp16) shr 2
-
-	tmp16 = input shl 3 // Q4 -> Q7
-	tmp16 -= mean // Q7 - Q7 = Q7
-
-	// To be used later, when updating noise/speech model.
-	// delta = (x - m) / s^2, in Q11.
-	// Q-domain: (Q14 * Q7) shr 10 = Q11
-	val delta = (invStd2 * tmp16) shr 10
-
-	// Calculate the exponent [tmp32] = (x - m)^2 / (2 * s^2), in Q10.
-	// Replacing division by two with one shift.
-	// Q-domain: (Q11 * Q7) shr 8 = Q10.
-	tmp32 = (delta * tmp16) shr 9
-
-	// If the exponent is small enough to give a non-zero probability, we calculate
-	// exp_value ~= exp(-(x - m)^2 / (2 * s^2))
-	//           ~= exp2(-log2(exp(1)) * tmp32)
-	val kCompVar = 22005
-	if (tmp32 < kCompVar) {
-		// Calculate [tmp16] = log2(exp(1)) * [tmp32], in Q10.
-		// Q-domain: (Q12 * Q10) shr 12 = Q10.
-		val kLog2Exp = 5909 // log2(exp(1)) in Q12.
-		tmp16 = (kLog2Exp * tmp32) shr 12
-		tmp16 = -tmp16
-		expValue = 0x0400 or (tmp16 and 0x03FF)
-		tmp16 = tmp16 xor 0xFFFF
-		tmp16 = tmp16 shr 10
-		tmp16 += 1
-		// Get [exp_value] = exp(-[tmp32]) in Q10.
-		expValue = expValue shr tmp16
-	}
-
-	// Calculate and return (1 / s) * exp(-(x - m)^2 / (2 * s^2)), in Q20.
-	// Q-domain: Q10 * Q10 = Q20.
-	val probability = invStd * expValue
-	return GaussianProbabilityResult(probability, delta)
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// webrtc/common_audio/vad/vad_core.c
-
-// Spectrum Weighting
-val kSpectrumWeight = intArrayOf(6, 8, 10, 12, 14, 16).apply { assert(size == kNumChannels) }
-val kNoiseUpdateConst: Int = 655 // Q15
-val kSpeechUpdateConst: Int = 6554 // Q15
-val kBackEta: Int = 154 // Q8
-// Minimum difference between the two models, Q5
-val kMinimumDifference = intArrayOf(544, 544, 576, 576, 576, 576).apply { assert(size == kNumChannels) }
-// Upper limit of mean value for speech model, Q7
-val kMaximumSpeech = intArrayOf(11392, 11392, 11520, 11520, 11520, 11520).apply { assert(size == kNumChannels) }
-// Minimum value for mean value
-val kMinimumMean = intArrayOf(640, 768).apply { assert(size == kNumGaussians) }
-// Upper limit of mean value for noise model, Q7
-val kMaximumNoise = intArrayOf(9216, 9088, 8960, 8832, 8704, 8576).apply { assert(size == kNumChannels) }
-// Start values for the Gaussian models, Q7
-// Weights for the two Gaussians for the six channels (noise)
-val kNoiseDataWeights = intArrayOf(34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103).apply { assert(size == kTableSize) }
-// Weights for the two Gaussians for the six channels (speech)
-val kSpeechDataWeights = intArrayOf(48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81).apply { assert(size == kTableSize) }
-// Means for the two Gaussians for the six channels (noise)
-val kNoiseDataMeans = intArrayOf(6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362).apply { assert(size == kTableSize) }
-// Means for the two Gaussians for the six channels (speech)
-val kSpeechDataMeans = intArrayOf(8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483).apply { assert(size == kTableSize) }
-// Stds for the two Gaussians for the six channels (noise)
-val kNoiseDataStds = intArrayOf(378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455).apply { assert(size == kTableSize) }
-// Stds for the two Gaussians for the six channels (speech)
-val kSpeechDataStds = intArrayOf(555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850).apply { assert(size == kTableSize) }
-
-// Constants used in GmmProbability().
-//
-// Maximum number of counted speech (VAD = 1) frames in a row.
-val kMaxSpeechFrames: Int = 6
-// Minimum standard deviation for both speech and noise.
-val kMinStd: Int = 384
-
-// Number of frequency bands (named channels)
-val kNumChannels = 6
-
-// Number of Gaussians per channel in the GMM
-val kNumGaussians = 2
-
-// Index = gaussian * kNumChannels + channel
-val kTableSize = kNumChannels * kNumGaussians
-
-// Minimum energy required to trigger audio signal
-val kMinEnergy = 10
-
-/**
- * The VAD operating modes in order of increasing aggressiveness.
- * A more aggressive VAD is more restrictive in reporting speech. Put in other words, the
- * probability of being speech when the VAD returns 1 is increased with increasing mode. As a
- * consequence, the missed detection rate also goes up.
- */
-enum class Aggressiveness {
-	Quality,
-	LowBitrate,
-	Aggressive,
-	VeryAggressive
-}
-
-class VadInstT(aggressiveness: Aggressiveness = Aggressiveness.Quality) {
-	// General variables
-	var vad: Int = 1 // Speech active (=1)
-	// TODO(bjornv): Change to [frame_count].
-	var frame_counter: Int = 0
-	var over_hang: Int = 0
-	var num_of_speech: Int = 0
-
-	// PDF parameters
-	var noise_means = kNoiseDataMeans.clone()
-	var speech_means = kSpeechDataMeans.clone()
-	var noise_stds = kNoiseDataStds.clone()
-	var speech_stds = kSpeechDataStds.clone()
-
-	// Index vector
-	// TODO(bjornv): Change to [age_vector].
-	var index_vector = IntArray(16 * kNumChannels) { 0 }
-
-	// Minimum value vector
-	var low_value_vector = IntArray(16 * kNumChannels) { 10000 }
-
-	// Splitting filter states
-	var upper_state = List(5) { MutableInt(0) }
-	var lower_state = List(5) { MutableInt(0) }
-
-	// High pass filter states
-	var hp_filter_state = IntArray(4) { 0 }
-
-	// Mean value memory for FindMinimum()
-	// TODO(bjornv): Change to [median].
-	var mean_value = IntArray(kNumChannels) { 1600 }
-
-	// Thresholds
-	val over_hang_max_1: Int = when(aggressiveness) {
-		Aggressiveness.Quality, Aggressiveness.LowBitrate -> 8
-		Aggressiveness.Aggressive, Aggressiveness.VeryAggressive -> 6
-	}
-	val over_hang_max_2: Int = when(aggressiveness) {
-		Aggressiveness.Quality, Aggressiveness.LowBitrate -> 14
-		Aggressiveness.Aggressive, Aggressiveness.VeryAggressive -> 9
-	}
-	// TODO: Rename to localThreshold?
-	val individual: Int = when(aggressiveness) {
-		Aggressiveness.Quality -> 24
-		Aggressiveness.LowBitrate -> 37
-		Aggressiveness.Aggressive -> 82
-		Aggressiveness.VeryAggressive -> 94
-	}
-	// TODO: Rename to globalThreshold?
-	val total: Int = when(aggressiveness) {
-		Aggressiveness.Quality -> 57
-		Aggressiveness.LowBitrate -> 100
-		Aggressiveness.Aggressive -> 285
-		Aggressiveness.VeryAggressive -> 1100
-	}
-}
-
-typealias WebRtcVadInst = VadInstT
-
-// Calculates the weighted average w.r.t. number of Gaussians. The [data] are
-// updated with an [offset] before averaging.
-//
-// - data     [i/o] : Data to average.
-// - offset   [i]   : An offset to add to each element of [data].
-// - weights  [i]   : Weights used for averaging.
-//
-// returns          : The weighted average.
-fun WeightedAverage(data: IntArray, channel: Int, offset: Int, weights: IntArray): Int {
-	var result = 0
-	for (k in 0 until kNumGaussians) {
-		val index = k * kNumChannels + channel
-		data[index] += offset
-		result += data[index] * weights[index]
-	}
-	return result
-}
-
-// Calculates the probabilities for both speech and background noise using
-// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
-// type of signal is most probable.
-//
-// - self           [i/o] : Pointer to VAD instance
-// - features       [i]   : Feature vector of length [kNumChannels]
-//                          = log10(energy in frequency band)
-// - total_power    [i]   : Total power in audio frame.
-// - frame_length   [i]   : Number of input samples
-//
-// - returns              : the VAD decision (0 - noise, 1 - speech).
-fun GmmProbability(self: VadInstT, features: List<Int>, total_power: Int, frame_length: Int): Int {
-	var vadflag = 0
-	var tmp_s16: Int
-	var tmp1_s16: Int
-	var tmp2_s16: Int
-	val deltaN = IntArray(kTableSize)
-	val deltaS = IntArray(kTableSize)
-	val ngprvec = IntArray(kTableSize) { 0 } // Conditional probability = 0.
-	val sgprvec = IntArray(kTableSize) { 0 } // Conditional probability = 0.
-	var sum_log_likelihood_ratios = 0
-	val noise_probability = IntArray(kNumGaussians)
-	val speech_probability = IntArray(kNumGaussians)
-
-	assert(frame_length == 80)
-
-	if (total_power > kMinEnergy) {
-		// The signal power of current frame is large enough for processing. The
-		// processing consists of two parts:
-		// 1) Calculating the likelihood of speech and thereby a VAD decision.
-		// 2) Updating the underlying model, w.r.t., the decision made.
-
-		// The detection scheme is an LRT with hypothesis
-		// H0: Noise
-		// H1: Speech
-		//
-		// We combine a global LRT with local tests, for each frequency sub-band,
-		// here defined as [channel].
-		for (channel in 0 until kNumChannels) {
-			// For each channel we model the probability with a GMM consisting of
-			// [kNumGaussians], with different means and standard deviations depending
-			// on H0 or H1.
-			var h0_test = 0
-			var h1_test = 0
-			for (k in 0 until kNumGaussians) {
-				val gaussian = channel + k * kNumChannels
-
-				// Probability under H0, that is, probability of frame being noise.
-				// Value given in Q27 = Q7 * Q20.
-				val pNoise = getGaussianProbability(features[channel], self.noise_means[gaussian], self.noise_stds[gaussian])
-				deltaN[gaussian] = pNoise.delta
-				noise_probability[k] = kNoiseDataWeights[gaussian] * pNoise.probability
-				h0_test += noise_probability[k] // Q27
-
-				// Probability under H1, that is, probability of frame being speech.
-				// Value given in Q27 = Q7 * Q20.
-				val pSpeech = getGaussianProbability(features[channel], self.speech_means[gaussian], self.speech_stds[gaussian])
-				speech_probability[k] = kSpeechDataWeights[gaussian] * pSpeech.probability
-				deltaS[gaussian] = pSpeech.delta
-				h1_test += speech_probability[k] // Q27
-			}
-
-			// Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H1}).
-			// Approximation:
-			// log2(Pr{X|H1} / Pr{X|H1}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H1}*2^Q)
-			//                           = log2(h1_test) - log2(h0_test)
-			//                           = log2(2^(31-shifts_h1)*(1+b1))
-			//                             - log2(2^(31-shifts_h0)*(1+b0))
-			//                           = shifts_h0 - shifts_h1
-			//                             + log2(1+b1) - log2(1+b0)
-			//                          ~= shifts_h0 - shifts_h1
-			//
-			// Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
-			// Further, b0 and b1 are independent and on the average the two terms cancel.
-			val shifts_h0 = if (h0_test != 0) normSigned(h0_test) else 31
-			val shifts_h1 = if (h1_test != 0) normSigned(h1_test) else 31
-			val log_likelihood_ratio = shifts_h0 - shifts_h1
-
-			// Update [sum_log_likelihood_ratios] with spectrum weighting. This is
-			// used for the global VAD decision.
-			sum_log_likelihood_ratios += log_likelihood_ratio * kSpectrumWeight[channel]
-
-			// Local VAD decision.
-			if ((log_likelihood_ratio * 4) > self.individual) {
-				vadflag = 1
-			}
-
-			// TODO(bjornv): The conditional probabilities below are applied on the
-			// hard coded number of Gaussians set to two. Find a way to generalize.
-			// Calculate local noise probabilities used later when updating the GMM.
-			val h0 = h0_test shr 12 // Q15
-			if (h0 > 0) {
-				// High probability of noise. Assign conditional probabilities for each
-				// Gaussian in the GMM.
-				val tmp = (noise_probability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
-				ngprvec[channel] = tmp safeDiv h0 // Q14
-				ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel]
-			} else {
-				// Low noise probability. Assign conditional probability 1 to the first
-				// Gaussian and 0 to the rest (which is already set at initialization).
-				ngprvec[channel] = 16384
-			}
-
-			// Calculate local speech probabilities used later when updating the GMM.
-			val h1 = h1_test shr 12 // Q15
-			if (h1 > 0) {
-				// High probability of speech. Assign conditional probabilities for each
-				// Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
-				val tmp = (speech_probability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
-				sgprvec[channel] = tmp safeDiv h1 // Q14
-				sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel]
-			}
-		}
-
-		// Make a global VAD decision.
-		vadflag = vadflag or (if (sum_log_likelihood_ratios >= self.total) 1 else 0)
-
-		// Update the model parameters.
-		var maxspe = 12800
-		for (channel in 0 until kNumChannels) {
-			// Get minimum value in past which is used for long term correction in Q4.
-			val feature_minimum = FindMinimum(self, features[channel], channel)
-
-			// Compute the "global" mean, that is the sum of the two means weighted.
-			var noise_global_mean = WeightedAverage(self.noise_means, channel, 0, kNoiseDataWeights)
-			tmp1_s16 = noise_global_mean shr 6 // Q8
-
-			for (k in 0 until kNumGaussians) {
-				val gaussian = channel + k * kNumChannels
-
-				val nmk = self.noise_means[gaussian]
-				val smk = self.speech_means[gaussian]
-				var nsk = self.noise_stds[gaussian]
-				var ssk = self.speech_stds[gaussian]
-
-				// Update noise mean vector if the frame consists of noise only.
-				var nmk2 = nmk
-				if (vadflag == 0) {
-					// deltaN = (x-mu)/sigma^2
-					// ngprvec[k] = |noise_probability[k]| /
-					//   (|noise_probability[0]| + |noise_probability[1]|)
-
-					// (Q14 * Q11 shr 11) = Q14.
-					val delt = (ngprvec[gaussian] * deltaN[gaussian]) shr 11
-					// Q7 + (Q14 * Q15 shr 22) = Q7.
-					nmk2 = nmk + ((delt * kNoiseUpdateConst) shr 22)
-				}
-
-				// Long term correction of the noise mean.
-				// Q8 - Q8 = Q8.
-				val ndelt = (feature_minimum shl 4) - tmp1_s16
-				// Q7 + (Q8 * Q8) shr 9 = Q7.
-				var nmk3 = nmk2 + ((ndelt * kBackEta) shr 9)
-
-				// Control that the noise mean does not drift to much.
-				tmp_s16 = (k + 5) shl 7
-				if (nmk3 < tmp_s16) {
-					nmk3 = tmp_s16
-				}
-				tmp_s16 = (72 + k - channel) shl 7
-				if (nmk3 > tmp_s16) {
-					nmk3 = tmp_s16
-				}
-				self.noise_means[gaussian] = nmk3
-
-				if (vadflag != 0) {
-					// Update speech mean vector:
-					// [deltaS] = (x-mu)/sigma^2
-					// sgprvec[k] = |speech_probability[k]| /
-					//   (|speech_probability[0]| + |speech_probability[1]|)
-
-					// (Q14 * Q11) shr 11 = Q14.
-					val delt = (sgprvec[gaussian] * deltaS[gaussian]) shr 11
-					// Q14 * Q15 shr 21 = Q8.
-					tmp_s16 = (delt * kSpeechUpdateConst) shr 21
-					// Q7 + (Q8 shr 1) = Q7. With rounding.
-					var smk2 = smk + ((tmp_s16 + 1) shr 1)
-
-					// Control that the speech mean does not drift to much.
-					val maxmu = maxspe + 640
-					if (smk2 < kMinimumMean[k]) {
-						smk2 = kMinimumMean[k]
-					}
-					if (smk2 > maxmu) {
-						smk2 = maxmu
-					}
-					self.speech_means[gaussian] = smk2 // Q7.
-
-					// (Q7 shr 3) = Q4. With rounding.
-					tmp_s16 = (smk + 4) shr 3
-
-					tmp_s16 = features[channel] - tmp_s16 // Q4
-					// (Q11 * Q4 shr 3) = Q12.
-					var tmp1_s32 = (deltaS[gaussian] * tmp_s16) shr 3
-					var tmp2_s32 = tmp1_s32 - 4096
-					tmp_s16 = sgprvec[gaussian] shr 2
-					// (Q14 shr 2) * Q12 = Q24.
-					tmp1_s32 = tmp_s16 * tmp2_s32
-
-					tmp2_s32 = tmp1_s32 shr 4 // Q20
-
-					// 0.1 * Q20 / Q7 = Q13.
-					if (tmp2_s32 > 0) {
-						tmp_s16 = tmp2_s32 safeDiv (ssk * 10)
-					} else {
-						tmp_s16 = -tmp2_s32 safeDiv (ssk * 10)
-						tmp_s16 = -tmp_s16
-					}
-					// Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
-					// Note that division by 4 equals shift by 2, hence,
-					// (Q13 shr 8) = (Q13 shr 6) / 4 = Q7.
-					tmp_s16 += 128 // Rounding.
-					ssk += (tmp_s16 shr 8)
-					if (ssk < kMinStd) {
-						ssk = kMinStd
-					}
-					self.speech_stds[gaussian] = ssk
-				} else {
-					// Update GMM variance vectors.
-					// deltaN * (features[channel] - nmk) - 1
-					// Q4 - (Q7 shr 3) = Q4.
-					tmp_s16 = features[channel] - (nmk shr 3)
-					// (Q11 * Q4 shr 3) = Q12.
-					var tmp1_s32 = (deltaN[gaussian] * tmp_s16) shr 3
-					tmp1_s32 -= 4096
-
-					// (Q14 shr 2) * Q12 = Q24.
-					tmp_s16 = (ngprvec[gaussian] + 2) shr 2
-					val tmp2_s32 = tmp_s16 * tmp1_s32
-					// Q20  * approx 0.001 (2^-10=0.0009766), hence,
-					// (Q24 shr 14) = (Q24 shr 4) / 2^10 = Q20.
-					tmp1_s32 = tmp2_s32 shr 14
-
-					// Q20 / Q7 = Q13.
-					if (tmp1_s32 > 0) {
-						tmp_s16 = tmp1_s32 safeDiv nsk
-					} else {
-						tmp_s16 = -tmp1_s32 safeDiv nsk
-						tmp_s16 = -tmp_s16
-					}
-					tmp_s16 += 32 // Rounding
-					nsk += tmp_s16 shr 6 // Q13 shr 6 = Q7.
-					if (nsk < kMinStd) {
-						nsk = kMinStd
-					}
-					self.noise_stds[gaussian] = nsk
-				}
-			}
-
-			// Separate models if they are too close.
-			// [noise_global_mean] in Q14 (= Q7 * Q7).
-			noise_global_mean = WeightedAverage(self.noise_means, channel, 0, kNoiseDataWeights)
-
-			// [speech_global_mean] in Q14 (= Q7 * Q7).
-			var speech_global_mean = WeightedAverage(self.speech_means, channel, 0, kSpeechDataWeights)
-
-			// [diff] = "global" speech mean - "global" noise mean.
-			// (Q14 shr 9) - (Q14 shr 9) = Q5.
-			val diff = (speech_global_mean shr 9) - (noise_global_mean shr 9)
-			if (diff < kMinimumDifference[channel]) {
-				tmp_s16 = kMinimumDifference[channel] - diff
-
-				// [tmp1_s16] = ~0.8 * (kMinimumDifference - diff) in Q7.
-				// [tmp2_s16] = ~0.2 * (kMinimumDifference - diff) in Q7.
-				tmp1_s16 = (13 * tmp_s16) shr 2
-				tmp2_s16 = (3 * tmp_s16) shr 2
-
-				// Move Gaussian means for speech model by [tmp1_s16] and update
-				// [speech_global_mean]. Note that |self.speech_means[channel]| is
-				// changed after the call.
-				speech_global_mean = WeightedAverage(self.speech_means, channel, tmp1_s16, kSpeechDataWeights)
-
-				// Move Gaussian means for noise model by -[tmp2_s16] and update
-				// [noise_global_mean]. Note that |self.noise_means[channel]| is
-				// changed after the call.
-				noise_global_mean = WeightedAverage(self.noise_means, channel, -tmp2_s16, kNoiseDataWeights)
-			}
-
-			// Control that the speech & noise means do not drift to much.
-			maxspe = kMaximumSpeech[channel]
-			tmp2_s16 = speech_global_mean shr 7
-			if (tmp2_s16 > maxspe) {
-				// Upper limit of speech model.
-				tmp2_s16 -= maxspe
-
-				for (k in 0 until kNumGaussians) {
-					self.speech_means[channel + k * kNumChannels] -= tmp2_s16
-				}
-			}
-
-			tmp2_s16 = noise_global_mean shr 7
-			if (tmp2_s16 > kMaximumNoise[channel]) {
-				tmp2_s16 -= kMaximumNoise[channel]
-
-				for (k in 0 until kNumGaussians) {
-					self.noise_means[channel + k * kNumChannels] -= tmp2_s16
-				}
-			}
-		}
-		self.frame_counter++
-	}
-
-	// Smooth with respect to transition hysteresis.
-	if (vadflag == 0) {
-		if (self.over_hang > 0) {
-			vadflag = 2 + self.over_hang
-			self.over_hang--
-		}
-		self.num_of_speech = 0
-	} else {
-		self.num_of_speech++
-		if (self.num_of_speech > kMaxSpeechFrames) {
-			self.num_of_speech = kMaxSpeechFrames
-			self.over_hang = self.over_hang_max_2
-		} else {
-			self.over_hang = self.over_hang_max_1
-		}
-	}
-	return vadflag
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// webrtc/common_audio/vad/vad_sp.c
-
-val kSmoothingDown = 6553 // 0.2 in Q15.
-val kSmoothingUp = 32439 // 0.99 in Q15.
-
-// Updates and returns the smoothed feature minimum. As minimum we use the
-// median of the five smallest feature values in a 100 frames long window.
-// As long as |handle->frame_counter| is zero, that is, we haven't received any
-// "valid" data, FindMinimum() outputs the default value of 1600.
-//
-// Inputs:
-//      - feature_value : New feature value to update with.
-//      - channel       : Channel number.
-//
-// Input & Output:
-//      - handle        : State information of the VAD.
-//
-// Returns:
-//                      : Smoothed minimum value for a moving window.
-// Inserts [feature_value] into [low_value_vector], if it is one of the 16
-// smallest values the last 100 frames. Then calculates and returns the median
-// of the five smallest values.
-fun FindMinimum(self: VadInstT, feature_value: Int, channel: Int): Int {
-	var position = -1
-	var current_median = 1600
-	var alpha = 0
-	var tmp32 = 0
-	val offset = channel * 16
-
-	// Accessor for the age of each value of the [channel]
-	val age = object { // TODO: Inline?
-		operator fun get(i: Int) = self.index_vector[offset + i]
-		operator fun set(i: Int, value: Int) { self.index_vector[offset + i] = value }
-	}
-
-	// Accessor for the 16 minimum values of the [channel]
-	val smallest_values = object { // TODO: Inline?
-		operator fun get(i: Int) = self.low_value_vector[offset + i]
-		operator fun set(i: Int, value: Int) { self.low_value_vector[offset + i] = value }
-	}
-
-	assert(channel < kNumChannels)
-
-	// Each value in [smallest_values] is getting 1 loop older. Update [age], and
-	// remove old values.
-	for (i in 0 until 16) {
-		if (age[i] != 100) {
-			age[i]++
-		} else {
-			// Too old value. Remove from memory and shift larger values downwards.
-			for (j in i until 15) {
-				smallest_values[j] = smallest_values[j + 1]
-				age[j] = age[j + 1]
-			}
-			age[15] = 101
-			smallest_values[15] = 10000
-		}
-	}
-
-	// Check if [feature_value] is smaller than any of the values in
-	// [smallest_values]. If so, find the [position] where to insert the new value
-	// ([feature_value]).
-	if (feature_value < smallest_values[7]) {
-		if (feature_value < smallest_values[3]) {
-			if (feature_value < smallest_values[1]) {
-				if (feature_value < smallest_values[0]) {
-					position = 0
-				} else {
-					position = 1
-				}
-			} else if (feature_value < smallest_values[2]) {
-				position = 2
-			} else {
-				position = 3
-			}
-		} else if (feature_value < smallest_values[5]) {
-			if (feature_value < smallest_values[4]) {
-				position = 4
-			} else {
-				position = 5
-			}
-		} else if (feature_value < smallest_values[6]) {
-			position = 6
-		} else {
-			position = 7
-		}
-	} else if (feature_value < smallest_values[15]) {
-		if (feature_value < smallest_values[11]) {
-			if (feature_value < smallest_values[9]) {
-				if (feature_value < smallest_values[8]) {
-					position = 8
-				} else {
-					position = 9
-				}
-			} else if (feature_value < smallest_values[10]) {
-				position = 10
-			} else {
-				position = 11
-			}
-		} else if (feature_value < smallest_values[13]) {
-			if (feature_value < smallest_values[12]) {
-				position = 12
-			} else {
-				position = 13
-			}
-		} else if (feature_value < smallest_values[14]) {
-			position = 14
-		} else {
-			position = 15
-		}
-	}
-
-	// If we have detected a new small value, insert it at the correct position
-	// and shift larger values up.
-	if (position > -1) {
-		for (i in 15 downTo position + 1) {
-			smallest_values[i] = smallest_values[i - 1]
-			age[i] = age[i - 1]
-		}
-		smallest_values[position] = feature_value
-		age[position] = 1
-	}
-
-	// Get [current_median].
-	if (self.frame_counter > 2) {
-		current_median = smallest_values[2]
-	} else if (self.frame_counter > 0) {
-		current_median = smallest_values[0]
-	}
-
-	// Smooth the median value.
-	if (self.frame_counter > 0) {
-		if (current_median < self.mean_value[channel]) {
-			alpha = kSmoothingDown // 0.2 in Q15.
-		} else {
-			alpha = kSmoothingUp // 0.99 in Q15.
-		}
-	}
-	tmp32 = (alpha + 1) * self.mean_value[channel]
-	tmp32 += (Short.MAX_VALUE - alpha) * current_median
-	tmp32 += 16384
-	self.mean_value[channel] = tmp32 shr 15
-
-	return self.mean_value[channel]
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// webrtc/common_audio/vad/vad_filterbank.c
-
-// Constants used in LogOfEnergy().
-val kLogConst = 24660 // 160*log10(2) in Q9.
-val kLogEnergyIntPart = 14336 // 14 in Q10
-
-// Coefficients used by HighPassFilter, Q14.
-val kHpZeroCoefs = intArrayOf(6631, -13262, 6631)
-val kHpPoleCoefs = intArrayOf(16384, -7756, 5620)
-
-// Allpass filter coefficients, upper and lower, in Q15.
-// Upper: 0.64, Lower: 0.17
-val kUpperAllPassCoefsQ15 = 20972
-val kLowerAllPassCoefsQ15 = 5571
-
-// Adjustment for division with two in SplitFilter.
-val kOffsetVector = intArrayOf(368, 368, 272, 176, 176, 176)
-
-// High pass filtering, with a cut-off frequency at 80 Hz, if the [data_in] is
-// sampled at 500 Hz.
-//
-// - data_in      [i]   : Input audio data sampled at 500 Hz.
-// - data_length  [i]   : Length of input and output data.
-// - filter_state [i/o] : State of the filter.
-// - data_out     [o]   : Output audio data in the frequency interval
-//                        80 - 250 Hz.
-fun HighPassFilter(input: AudioBuffer, filter_state: IntArray): AudioBuffer {
-	// The sum of the absolute values of the impulse response:
-	// The zero/pole-filter has a max amplification of a single sample of: 1.4546
-	// Impulse response: 0.4047 -0.6179 -0.0266  0.1993  0.1035  -0.0194
-	// The all-zero section has a max amplification of a single sample of: 1.6189
-	// Impulse response: 0.4047 -0.8094  0.4047  0       0        0
-	// The all-pole section has a max amplification of a single sample of: 1.9931
-	// Impulse response: 1.0000  0.4734 -0.1189 -0.2187 -0.0627   0.04532
-
-	val result = SampleArray(input.size)
-	for (i in 0 until input.size) {
-		// All-zero section (filter coefficients in Q14).
-		var tmp32 = kHpZeroCoefs[0] * input[i]
-		tmp32 += kHpZeroCoefs[1] * filter_state[0]
-		tmp32 += kHpZeroCoefs[2] * filter_state[1]
-		filter_state[1] = filter_state[0]
-		filter_state[0] = input[i].toInt()
-
-		// All-pole section (filter coefficients in Q14).
-		tmp32 -= kHpPoleCoefs[1] * filter_state[2]
-		tmp32 -= kHpPoleCoefs[2] * filter_state[3]
-		filter_state[3] = filter_state[2]
-		filter_state[2] = tmp32 shr 14
-		result[i] = filter_state[2].toShort()
-	}
-
-	return AudioBuffer(result)
-}
-
-// All pass filtering of [data_in], used before splitting the signal into two
-// frequency bands (low pass vs high pass).
-// Note that [data_in] and [data_out] can NOT correspond to the same address.
-//
-// - data_in            [i]   : Input audio signal given in Q0.
-// - data_length        [i]   : Length of input and output data.
-// - filter_coefficient [i]   : Given in Q15.
-// - filter_state       [i/o] : State of the filter given in Q(-1).
-// - data_out           [o]   : Output audio signal given in Q(-1).
-fun AllPassFilter(input: AudioBuffer, filter_coefficient: Int, filter_state: MutableInt): AudioBuffer {
-	// The filter can only cause overflow (in the w16 output variable)
-	// if more than 4 consecutive input numbers are of maximum value and
-	// has the the same sign as the impulse responses first taps.
-	// First 6 taps of the impulse response:
-	// 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990
-
-	val result = SampleArray((input.size + 1) / 2)
-	var state32 = filter_state.toInt() * (1 shl 16) // Q15
-	for (i in 0 until input.size step 2) {
-		val tmp32 = state32 + filter_coefficient * input[i]
-		val tmp16 = tmp32 shr 16 // Q(-1)
-		result[i / 2] = tmp16.toShort()
-		state32 = input[i] * (1 shl 14) - filter_coefficient * tmp16 // Q14
-		state32 *= 2 // Q15.
-	}
-	filter_state.setValue(state32 shr 16) // Q(-1)
-
-	return AudioBuffer(result)
-}
-
-data class SplitFilterResult(
-	val highPassData: AudioBuffer,
-	val lowPassData: AudioBuffer
-)
-
-// Splits [data_in] into [hp_data_out] and [lp_data_out] corresponding to
-// an upper (high pass) part and a lower (low pass) part respectively.
-//
-// - data_in      [i]   : Input audio data to be split into two frequency bands.
-// - data_length  [i]   : Length of [data_in].
-// - upper_state  [i/o] : State of the upper filter, given in Q(-1).
-// - lower_state  [i/o] : State of the lower filter, given in Q(-1).
-// - hp_data_out  [o]   : Output audio data of the upper half of the spectrum.
-//                        The length is [data_length] / 2.
-// - lp_data_out  [o]   : Output audio data of the lower half of the spectrum.
-//                        The length is [data_length] / 2.
-fun SplitFilter(input: AudioBuffer, upper_state: MutableInt, lower_state: MutableInt): SplitFilterResult {
-	val resultSize = input.size / 2 // Downsampling by 2
-
-	// All-pass filtering upper branch.
-	val tempHighPass = AllPassFilter(input, kUpperAllPassCoefsQ15, upper_state)
-	assert(tempHighPass.size == resultSize)
-
-	// All-pass filtering lower branch.
-	val tempLowPass = AllPassFilter(AudioBuffer(input, 1), kLowerAllPassCoefsQ15, lower_state)
-	assert(tempLowPass.size == resultSize)
-
-	// Make LP and HP signals.
-	val highPassData = SampleArray(resultSize)
-	val lowPassData = SampleArray(resultSize)
-	for (i in 0 until resultSize) {
-		highPassData[i] = (tempHighPass[i] - tempLowPass[i]).toShort()
-		lowPassData[i] = (tempLowPass[i] + tempHighPass[i]).toShort()
-	}
-
-	return SplitFilterResult(AudioBuffer(highPassData), AudioBuffer(lowPassData))
-}
-
-// Calculates the energy of [data_in] in dB, and also updates an overall
-// [total_energy] if necessary.
-//
-// - data_in      [i]   : Input audio data for energy calculation.
-// - data_length  [i]   : Length of input data.
-// - offset       [i]   : Offset value added to [log_energy].
-// - total_energy [i/o] : An external energy updated with the energy of
-//                        [data_in].
-//                        NOTE: [total_energy] is only updated if
-//                        [total_energy] <= [kMinEnergy].
-// - log_energy   [o]   : 10 * log10("energy of [data_in]") given in Q4.
-fun LogOfEnergy(input: AudioBuffer, offset: Int, total_energy: MutableInt): Int {
-	assert(input.size > 0)
-
-	val energyResult = getEnergy(input)
-	// [tot_rshifts] accumulates the number of right shifts performed on [energy].
-	var tot_rshifts = energyResult.rightShifts
-	// The [energy] will be normalized to 15 bits. We use unsigned integer because
-	// we eventually will mask out the fractional part.
-	var energy = energyResult.energy.toUInt()
-
-	if (energy == 0u) {
-		return offset
-	}
-
-	// By construction, normalizing to 15 bits is equivalent with 17 leading
-	// zeros of an unsigned 32 bit value.
-	val normalizing_rshifts = 17 - normUnsigned(energy)
-	// In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
-	// (14 shl 10), which is what we initialize [log2_energy] with. For a more
-	// detailed derivations, see below.
-	var log2_energy = kLogEnergyIntPart
-
-	tot_rshifts += normalizing_rshifts
-	// Normalize [energy] to 15 bits.
-	// [tot_rshifts] is now the total number of right shifts performed on
-	// [energy] after normalization. This means that [energy] is in
-	// Q(-tot_rshifts).
-	energy = if (normalizing_rshifts < 0)
-		energy shl -normalizing_rshifts
-	else
-		energy shr normalizing_rshifts
-
-	// Calculate the energy of [data_in] in dB, in Q4.
-	//
-	// 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
-	// 160 * log10([energy] * 2^[tot_rshifts]) =
-	// 160 * log10(2) * log2([energy] * 2^[tot_rshifts]) =
-	// 160 * log10(2) * (log2([energy]) + log2(2^[tot_rshifts])) =
-	// (160 * log10(2)) * (log2([energy]) + [tot_rshifts]) =
-	// [kLogConst] * ([log2_energy] + [tot_rshifts])
-	//
-	// We know by construction that [energy] is normalized to 15 bits. Hence,
-	// [energy] = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
-	// Further, we'd like [log2_energy] in Q10
-	// log2([energy]) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
-	// 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
-	// 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
-	// (14 shl 10) + 2^10 * (frac_Q15 * 2^-14) =
-	// (14 shl 10) + (frac_Q15 * 2^-4) = (14 shl 10) + (frac_Q15 shr 4)
-	//
-	// Note that frac_Q15 = ([energy] & 0x00003FFF)
-
-	// Calculate and add the fractional part to [log2_energy].
-	log2_energy += ((energy and 0x00003FFFu) shr 4).toInt()
-
-	// [kLogConst] is in Q9, [log2_energy] in Q10 and [tot_rshifts] in Q0.
-	// Note that we in our derivation above have accounted for an output in Q4.
-	var log_energy = ((kLogConst * log2_energy) shr 19) + ((tot_rshifts * kLogConst) shr 9)
-
-	if (log_energy < 0) {
-		log_energy = 0
-	}
-
-	log_energy += offset
-
-	// Update the approximate [total_energy] with the energy of [data_in], if
-	// [total_energy] has not exceeded [kMinEnergy]. [total_energy] is used as an
-	// energy indicator in GmmProbability() in vad_core.c.
-	if (total_energy.toInt() <= kMinEnergy) {
-		if (tot_rshifts >= 0) {
-			// We know by construction that the [energy] > [kMinEnergy] in Q0, so add
-			// an arbitrary value such that [total_energy] exceeds [kMinEnergy].
-			total_energy.add(kMinEnergy + 1)
-		} else {
-			// By construction [energy] is represented by 15 bits, hence any number of
-			// right shifted [energy] will fit in an Int. In addition, adding the
-			// value to [total_energy] is wrap around safe as long as
-			// [kMinEnergy] < 8192.
-			total_energy.add((energy shr -tot_rshifts).toInt()) // Q0.
-		}
-	}
-
-	return log_energy
-}
-
-data class FeatureResult(
-	// 10 * log10(energy in each frequency band), Q4
-	val features: List<Int>,
-	// Total energy of the signal
-	// NOTE: This value is not exact. It is only used in a comparison.
-	val totalEnergy: Int
-)
-
-// Takes [data_length] samples of [data_in] and calculates the logarithm of the
-// energy of each of the [kNumChannels] = 6 frequency bands used by the VAD:
-//        80 Hz - 250 Hz
-//        250 Hz - 500 Hz
-//        500 Hz - 1000 Hz
-//        1000 Hz - 2000 Hz
-//        2000 Hz - 3000 Hz
-//        3000 Hz - 4000 Hz
-//
-// The values are given in Q4 and written to [features]. Further, an approximate
-// overall energy is returned. The return value is used in
-// GmmProbability() as a signal indicator, hence it is arbitrary above
-// the threshold [kMinEnergy].
-//
-// - self         [i/o] : State information of the VAD.
-// - data_in      [i]   : Input audio data, for feature extraction.
-// - data_length  [i]   : Audio data size, in number of samples.
-// - features     [o]   : 10 * log10(energy in each frequency band), Q4.
-// - returns            : Total energy of the signal (NOTE! This value is not
-//                        exact. It is only used in a comparison.)
-fun CalculateFeatures(self: VadInstT, input: AudioBuffer): FeatureResult {
-	assert(input.size == 80)
-
-	// Split at 2000 Hz and downsample.
-	var frequency_band = 0
-	val `0 to 4000 Hz` = input
-	val (`2000 to 4000 Hz`, `0 to 2000 Hz`) =
-		SplitFilter(`0 to 4000 Hz`, self.upper_state[frequency_band], self.lower_state[frequency_band])
-
-	// For the upper band (2000 to 4000 Hz) split at 3000 Hz and downsample.
-	frequency_band = 1
-	val (`3000 to 4000 Hz`, `2000 to 3000 Hz`) =
-		SplitFilter(`2000 to 4000 Hz`, self.upper_state[frequency_band], self.lower_state[frequency_band])
-
-	// For the lower band (0 to 2000 Hz) split at 1000 Hz and downsample.
-	frequency_band = 2
-	val (`1000 to 2000 Hz`, `0 to 1000 Hz`) =
-		SplitFilter(`0 to 2000 Hz`, self.upper_state[frequency_band], self.lower_state[frequency_band])
-
-	// For the lower band (0 to 1000 Hz) split at 500 Hz and downsample.
-	frequency_band = 3
-	val (`500 to 1000 Hz`, `0 to 500 Hz`) =
-		SplitFilter(`0 to 1000 Hz`, self.upper_state[frequency_band], self.lower_state[frequency_band])
-
-	// For the lower band (0 t0 500 Hz) split at 250 Hz and downsample.
-	frequency_band = 4
-	val (`250 to 500 Hz`, `0 to 250 Hz`) =
-		SplitFilter(`0 to 500 Hz`, self.upper_state[frequency_band], self.lower_state[frequency_band])
-
-	// Remove 0 to 80 Hz by high pass filtering the lower band.
-	val `80 to 250 Hz` = HighPassFilter(`0 to 250 Hz`, self.hp_filter_state)
-
-	val total_energy = MutableInt(0)
-	val `energy in 3000 to 4000 Hz` = LogOfEnergy(`3000 to 4000 Hz`, kOffsetVector[5], total_energy)
-	val `energy in 2000 to 3000 Hz` = LogOfEnergy(`2000 to 3000 Hz`, kOffsetVector[4], total_energy)
-	val `energy in 1000 to 2000 Hz` = LogOfEnergy(`1000 to 2000 Hz`, kOffsetVector[3], total_energy)
-	val `energy in 500 to 1000 Hz` = LogOfEnergy(`500 to 1000 Hz`, kOffsetVector[2], total_energy)
-	val `energy in 250 to 500 Hz` = LogOfEnergy(`250 to 500 Hz`, kOffsetVector[1], total_energy)
-	val `energy in 50 to 250 Hz` = LogOfEnergy(`80 to 250 Hz`, kOffsetVector[0], total_energy)
-
-	val features = listOf(
-		`energy in 50 to 250 Hz`,
-		`energy in 250 to 500 Hz`,
-		`energy in 500 to 1000 Hz`,
-		`energy in 1000 to 2000 Hz`,
-		`energy in 2000 to 3000 Hz`,
-		`energy in 3000 to 4000 Hz`
-	)
-	assert(features.size == kNumChannels)
-	return FeatureResult(features, total_energy.toInt())
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////
-// webrtc/common_audio/vad/webrtc_vad.c
-
-// This function was moved from vad_core.c.
-/****************************************************************************
- * CalcVad48khz(...)
- * CalcVad32khz(...)
- * CalcVad16khz(...)
- * CalcVad8khz(...)
- *
- * Calculate probability for active speech and make VAD decision.
- *
- * Input:
- *      - inst          : Instance that should be initialized
- *      - speech_frame  : Input speech frame
- *      - frame_length  : Number of input samples
- *
- * Output:
- *      - inst          : Updated filter states etc.
- *
- * Return value         : VAD decision
- *                        0 - No active speech
- *                        1-6 - Active speech
- */
-fun CalcVad8khz(inst: VadInstT, speech_frame: AudioBuffer): Int {
-	// Get power in the bands
-	val (features, totalEnergy) = CalculateFeatures(inst, speech_frame)
-
-	// Make a VAD
-	inst.vad = GmmProbability(inst, features, totalEnergy, speech_frame.size)
-
-	return inst.vad
-}
-
-// Calculates a VAD decision for the [audio_frame]. For valid sampling rates
-// frame lengths, see the description of ValidRatesAndFrameLengths().
-//
-// - handle       [i/o] : VAD Instance. Needs to be initialized by
-//                        InitVadInst() before call.
-// - fs           [i]   : Sampling frequency (Hz): 8000, 16000, or 32000
-// - audio_frame  [i]   : Audio frame buffer.
-// - frame_length [i]   : Length of audio frame buffer in number of samples.
-//
-// returns              : 1 - (Active Voice),
-//                        0 - (Non-active Voice),
-//                       -1 - (Error)
-fun ProcessVad(self: VadInstT, fs: Int, audio_frame: AudioBuffer): Boolean {
-	assert(fs == 8000)
-
-	val vad = CalcVad8khz(self, audio_frame)
-	// return vad != 0
-	return vad == 1
-}
diff --git a/rhubarb/src/main/kotlin/voice_activity_detection/SignalProcessing.kt b/rhubarb/src/main/kotlin/voice_activity_detection/SignalProcessing.kt
new file mode 100644
index 0000000..3936db4
--- /dev/null
+++ b/rhubarb/src/main/kotlin/voice_activity_detection/SignalProcessing.kt
@@ -0,0 +1,298 @@
+package voice_activity_detection
+
+import AudioBuffer
+import SampleArray
+import org.apache.commons.lang3.mutable.MutableInt
+import kotlin.math.absoluteValue
+
+/** Minimum energy required to trigger audio signal */
+const val MIN_ENERGY = 10
+
+private const val LOG_CONST = 24660 // 160*log10(2) in Q9.
+private const val LOG_ENERGY_INT_PART = 14336 // 14 in Q10
+
+private val HP_ZERO_COEFS = intArrayOf(6631, -13262, 6631)
+private val HP_POLE_COEFS = intArrayOf(16384, -7756, 5620)
+
+private const val UPPER_ALL_PASS_COEFS_Q15 = 20972 // 0.64
+private const val LOWER_ALL_PASS_COEFS_Q15 = 5571 // 0.17
+
+/**
+ * Table used by getLeadingZeroCount.
+ * For each UInt n that's a sequence of 0 bits followed by a sequence of 1 bits, the entry at index
+ * (n * 0x8c0b2891) shr 26 in this table gives the number of zero bits in n.
+ */
+private val LEADING_ZEROS_TABLE = intArrayOf(
+	32, 8,  17, -1, -1, 14, -1, -1, -1, 20, -1, -1, -1, 28, -1, 18,
+	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  26, 25, 24,
+	4,  11, 23, 31, 3,  7,  10, 16, 22, 30, -1, -1, 2,  6,  13, 9,
+	-1, 15, -1, 21, -1, 29, 19, -1, -1, -1, -1, -1, 1,  27, 5,  12
+).apply { assert(size == 64) }
+
+/** Returns the number of leading zero bits in the argument. */
+fun getLeadingZeroCount(n: UInt): Int {
+	// Normalize n by rounding up to the nearest number that is a sequence of 0 bits followed by a
+	// sequence of 1 bits. This number has the same number of leading zeros as the original n.
+	// There are exactly 33 such values.
+	var normalized = n
+	normalized = normalized or (normalized shr 1)
+	normalized = normalized or (normalized shr 2)
+	normalized = normalized or (normalized shr 4)
+	normalized = normalized or (normalized shr 8)
+	normalized = normalized or (normalized shr 16)
+
+	// Multiply the modified n with a constant selected (by exhaustive search) such that each of the
+	// 33 possible values of n give a product whose 6 most significant bits are unique.
+	// Then look up the answer in the table.
+	return LEADING_ZEROS_TABLE[((normalized * 0x8c0b2891u) shr 26).toInt()]
+}
+
+/**
+ * Returns the number of bits by which a signed int can be left-shifted without overflow, or 0 if
+ * a == 0.
+ */
+fun normSigned(a: Int): Int =
+	if (a == 0)
+		0
+	else
+		getLeadingZeroCount((if (a < 0) a.inv() else a).toUInt()) - 1
+
+/**
+ * Returns the number of bits by which an unsigned int can be left-shifted without overflow, or 0 if
+ * a == 0.
+ */
+fun normUnsigned(a: UInt): Int = if (a == 0u) 0 else getLeadingZeroCount(a)
+
+/** Returns the number of bits needed to represent the specified value. */
+fun getBitCount(n: UInt): Int = 32 - getLeadingZeroCount(n)
+
+/**
+ * Returns the number of right bit shifts that must be applied to each of the given samples so that,
+ * if the squares of the samples are added [times] times, the signed 32-bit addition will not
+ * overflow.
+ */
+fun getScalingSquare(buffer: AudioBuffer, times: Int): Int {
+	var maxAbsSample = -1
+	for (i in 0 until buffer.size) {
+		val absSample = buffer[i].toInt().absoluteValue
+		if (absSample > maxAbsSample) {
+			maxAbsSample = absSample
+		}
+	}
+
+	if (maxAbsSample == 0) {
+		return 0 // Since norm(0) returns 0
+	}
+
+	val t = normSigned(maxAbsSample * maxAbsSample)
+	val bitCount = getBitCount(times.toUInt())
+	return if (t > bitCount) 0 else bitCount - t
+}
+
+data class EnergyResult(
+	/**
+	 * The number of left bit shifts needed to get the physical energy value, i.e., to get the Q0
+	 * value
+	 */
+	val rightShifts: Int,
+
+	/** The energy value in Q(-[rightShifts]) */
+	val energy: Int
+)
+
+/** Calculates the energy of an audio buffer. */
+fun getEnergy(buffer: AudioBuffer): EnergyResult {
+	val scaling = getScalingSquare(buffer, buffer.size)
+
+	var energy = 0
+	for (i in 0 until buffer.size) {
+		energy += (buffer[i] * buffer[i]) shr scaling
+	}
+
+	return EnergyResult(scaling, energy)
+}
+
+/**
+ * Performs high pass filtering with a cut-off frequency at 80 Hz, if [input] is sampled at 500 Hz.
+ * @return Output audio data in the frequency interval 80 to 250 Hz.
+ */
+fun highPassFilter(input: AudioBuffer, filterState: IntArray): AudioBuffer {
+	// The sum of the absolute values of the impulse response:
+	// The zero/pole-filter has a max amplification of a single sample of: 1.4546
+	// Impulse response: 0.4047 -0.6179 -0.0266  0.1993  0.1035  -0.0194
+	// The all-zero section has a max amplification of a single sample of: 1.6189
+	// Impulse response: 0.4047 -0.8094  0.4047  0       0        0
+	// The all-pole section has a max amplification of a single sample of: 1.9931
+	// Impulse response: 1.0000  0.4734 -0.1189 -0.2187 -0.0627   0.04532
+
+	val result = SampleArray(input.size)
+	for (i in 0 until input.size) {
+		// All-zero section (filter coefficients in Q14).
+		var tmp32 = HP_ZERO_COEFS[0] * input[i]
+		tmp32 += HP_ZERO_COEFS[1] * filterState[0]
+		tmp32 += HP_ZERO_COEFS[2] * filterState[1]
+		filterState[1] = filterState[0]
+		filterState[0] = input[i].toInt()
+
+		// All-pole section (filter coefficients in Q14).
+		tmp32 -= HP_POLE_COEFS[1] * filterState[2]
+		tmp32 -= HP_POLE_COEFS[2] * filterState[3]
+		filterState[3] = filterState[2]
+		filterState[2] = tmp32 shr 14
+		result[i] = filterState[2].toShort()
+	}
+
+	return AudioBuffer(result)
+}
+
+/**
+ * Performs all pass filtering, used before splitting the signal into two frequency bands (low pass
+ * vs high pass).
+ * @param[filterCoefficient] Given in Q15.
+ * @param[filterState] State of the filter given in Q(-1).
+ * @return Output audio signal given in Q(-1).
+ */
+fun allPassFilter(input: AudioBuffer, filterCoefficient: Int, filterState: MutableInt): AudioBuffer {
+	// The filter can only cause overflow (in the w16 output variable) if more than 4 consecutive
+	// input numbers are of maximum value and have the same sign as the impulse response's first
+	// taps.
+	// First 6 taps of the impulse response:
+	// 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990
+
+	val result = SampleArray((input.size + 1) / 2)
+	var state32 = filterState.toInt() * (1 shl 16) // Q15
+	for (i in 0 until input.size step 2) {
+		val tmp32 = state32 + filterCoefficient * input[i]
+		val tmp16 = tmp32 shr 16 // Q(-1)
+		result[i / 2] = tmp16.toShort()
+		state32 = input[i] * (1 shl 14) - filterCoefficient * tmp16 // Q14
+		state32 *= 2 // Q15.
+	}
+	filterState.setValue(state32 shr 16) // Q(-1)
+
+	return AudioBuffer(result)
+}
+
+data class SplitFilterResult(
+	val highPassData: AudioBuffer,
+	val lowPassData: AudioBuffer
+)
+
+/**
+ * Splits audio data into an upper (high pass) part and a lower (low pass) part.
+ * @param[upperState] State of the upper filter, given in Q(-1).
+ * @param[lowerState] State of the lower filter, given in Q(-1).
+ */
+fun splitFilter(input: AudioBuffer, upperState: MutableInt, lowerState: MutableInt): SplitFilterResult {
+	val resultSize = input.size / 2 // Downsampling by 2
+
+	// All-pass filtering upper branch.
+	val tempHighPass = allPassFilter(input, UPPER_ALL_PASS_COEFS_Q15, upperState)
+	assert(tempHighPass.size == resultSize)
+
+	// All-pass filtering lower branch.
+	val tempLowPass = allPassFilter(AudioBuffer(input, 1), LOWER_ALL_PASS_COEFS_Q15, lowerState)
+	assert(tempLowPass.size == resultSize)
+
+	// Make LP and HP signals.
+	val highPassData = SampleArray(resultSize)
+	val lowPassData = SampleArray(resultSize)
+	for (i in 0 until resultSize) {
+		highPassData[i] = (tempHighPass[i] - tempLowPass[i]).toShort()
+		lowPassData[i] = (tempLowPass[i] + tempHighPass[i]).toShort()
+	}
+
+	return SplitFilterResult(AudioBuffer(highPassData), AudioBuffer(lowPassData))
+}
+
+/**
+ * Calculates the energy of the input signal in dB, and also updates an overall [totalEnergy] if
+ * necessary.
+ * @param[offset] Offset value added to result.
+ * @param[totalEnergy] An external energy updated with the energy of the input signal.
+ *   NOTE: [totalEnergy] is only updated if [totalEnergy] <= [MIN_ENERGY].
+ * @return 10 * log10(energy of input signal) given in Q4.
+ */
+fun logOfEnergy(input: AudioBuffer, offset: Int, totalEnergy: MutableInt): Int {
+	assert(input.size > 0)
+
+	val energyResult = getEnergy(input)
+	// totalRightShifts accumulates the number of right shifts performed on energy.
+	var totalRightShifts = energyResult.rightShifts
+	// energy will be normalized to 15 bits. We use unsigned integer because we eventually will mask
+	// out the fractional part.
+	var energy = energyResult.energy.toUInt()
+
+	if (energy == 0u) {
+		return offset
+	}
+
+	// By construction, normalizing to 15 bits is equivalent with 17 leading zeros of an unsigned 32
+	// bit value.
+	val normalizingRightShifts = 17 - normUnsigned(energy)
+	// In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
+	// (14 shl 10), which is what we initialize log2Energy with. For a more detailed derivation,
+	// see below.
+	var log2Energy = LOG_ENERGY_INT_PART
+
+	totalRightShifts += normalizingRightShifts
+	// Normalize energy to 15 bits.
+	// totalRightShifts is now the total number of right shifts performed on energy after
+	// normalization. This means that energy is in Q(-totalRightShifts).
+	energy = if (normalizingRightShifts < 0)
+		energy shl -normalizingRightShifts
+	else
+		energy shr normalizingRightShifts
+
+	// Calculate the energy of input in dB, in Q4.
+	//
+	// 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
+	// 160 * log10(energy * 2^totalRightShifts) =
+	// 160 * log10(2) * log2(energy * 2^totalRightShifts) =
+	// 160 * log10(2) * (log2(energy) + log2(2^totalRightShifts)) =
+	// (160 * log10(2)) * (log2(energy) + totalRightShifts) =
+	// LOG_CONST * (log2Energy + totalRightShifts)
+	//
+	// We know by construction that energy is normalized to 15 bits.
+	// Hence, energy = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
+	// Further, we'd like log2Energy in Q10
+	// log2(energy) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
+	// 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
+	// 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
+	// (14 shl 10) + 2^10 * (frac_Q15 * 2^-14) =
+	// (14 shl 10) + (frac_Q15 * 2^-4) = (14 shl 10) + (frac_Q15 shr 4)
+	//
+	// Note that frac_Q15 = (energy & 0x00003FFF)
+
+	// Calculate and add the fractional part to log2Energy.
+	log2Energy += ((energy and 0x00003FFFu) shr 4).toInt()
+
+	// LOG_CONST is in Q9, log2Energy in Q10 and totalRightShifts in Q0.
+	// Note that in our derivation above, we have accounted for an output in Q4.
+	var logEnergy = ((LOG_CONST * log2Energy) shr 19) + ((totalRightShifts * LOG_CONST) shr 9)
+
+	if (logEnergy < 0) {
+		logEnergy = 0
+	}
+
+	logEnergy += offset
+
+	// Update the approximate totalEnergy with the energy of input, if totalEnergy has not exceeded
+	// MIN_ENERGY.
+	// totalEnergy is used as an energy indicator in getGmmProbability().
+	if (totalEnergy.toInt() <= MIN_ENERGY) {
+		if (totalRightShifts >= 0) {
+			// We know by construction that energy > MIN_ENERGY in Q0, so add an arbitrary value
+			// such that totalEnergy exceeds MIN_ENERGY.
+			totalEnergy.add(MIN_ENERGY + 1)
+		} else {
+			// By construction, energy is represented by 15 bits, hence any number of right shifted
+			// energy will fit in an Int.
+			// In addition, adding the value to totalEnergy is wrap around safe as long as
+			// MIN_ENERGY < 8192.
+			totalEnergy.add((energy shr -totalRightShifts).toInt()) // Q0.
+		}
+	}
+
+	return logEnergy
+}
diff --git a/rhubarb/src/main/kotlin/voice_activity_detection/VoiceActivityDetector.kt b/rhubarb/src/main/kotlin/voice_activity_detection/VoiceActivityDetector.kt
new file mode 100644
index 0000000..f72f94a
--- /dev/null
+++ b/rhubarb/src/main/kotlin/voice_activity_detection/VoiceActivityDetector.kt
@@ -0,0 +1,734 @@
+package voice_activity_detection
+
+import AudioBuffer
+import org.apache.commons.lang3.mutable.MutableInt
+
+private const val COMP_VAR = 22005
+private const val LOG2EXP = 5909 // log2(exp(1)) in Q12
+
+private val SPECTRUM_WEIGHT = intArrayOf(6, 8, 10, 12, 14, 16)
+	.apply { assert(size == CHANNEL_COUNT) }
+
+private const val NOISE_UPDATE_CONST: Int = 655 // Q15
+
+private const val SPEECH_UPDATE_CONST: Int = 6554 // Q15
+
+private const val BACK_ETA: Int = 154 // Q8
+
+/** Upper limit of mean value for speech model, Q7 */
+private val MAXIMUM_SPEECH = intArrayOf(11392, 11392, 11520, 11520, 11520, 11520)
+	.apply { assert(size == CHANNEL_COUNT) }
+
+/** Minimum difference between the two models, Q5 */
+private val MINIMUM_DIFFERENCE = intArrayOf(544, 544, 576, 576, 576, 576)
+	.apply { assert(size == CHANNEL_COUNT) }
+
+/** Minimum value for mean value */
+private val MINIMUM_MEAN = intArrayOf(640, 768)
+	.apply { assert(size == GAUSSIAN_COUNT) }
+
+/** Upper limit of mean value for noise model, Q7 */
+private val MAXIMUM_NOISE = intArrayOf(9216, 9088, 8960, 8832, 8704, 8576)
+	.apply { assert(size == CHANNEL_COUNT) }
+
+/** Weights for the two Gaussians for the six channels (noise) */
+private val NOISE_DATA_WEIGHTS = intArrayOf(34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103)
+	.apply { assert(size == TABLE_SIZE) }
+
+/** Weights for the two Gaussians for the six channels (speech) */
+private val SPEECH_DATA_WEIGHTS = intArrayOf(48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81)
+	.apply { assert(size == TABLE_SIZE) }
+
+/** Means for the two Gaussians for the six channels (noise) */
+private val NOISE_DATA_MEANS = intArrayOf(6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362)
+	.apply { assert(size == TABLE_SIZE) }
+
+/** Means for the two Gaussians for the six channels (speech) */
+private val SPEECH_DATA_MEANS = intArrayOf(8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483)
+	.apply { assert(size == TABLE_SIZE) }
+
+/** Stds for the two Gaussians for the six channels (noise) */
+private val NOISE_DATA_STDS = intArrayOf(378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455)
+	.apply { assert(size == TABLE_SIZE) }
+
+/** Stds for the two Gaussians for the six channels (speech) */
+private val SPEECH_DATA_STDS = intArrayOf(555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850)
+	.apply { assert(size == TABLE_SIZE) }
+
+/** Maximum number of counted speech (VAD = 1) frames in a row */
+private const val MAX_SPEECH_FRAMES = 6
+
+/** Minimum standard deviation for both speech and noise */
+private const val MIN_STD = 384
+
+/** Number of frequency bands (named channels) */
+private const val CHANNEL_COUNT = 6
+
+/** Number of Gaussians per channel in the GMM */
+private const val GAUSSIAN_COUNT = 2
+
+/**
+ * Size of a table containing one value per channel and Gaussian.
+ * Indexed as gaussian * CHANNEL_COUNT + channel.
+ */
+private const val TABLE_SIZE = CHANNEL_COUNT * GAUSSIAN_COUNT
+
+// Adjustment for division with two in splitFilter.
+private val SPLIT_FILTER_OFFSETS = intArrayOf(368, 368, 272, 176, 176, 176)
+
+private const val SMOOTHING_DOWN = 6553 // 0.2 in Q15.
+private const val SMOOTHING_UP = 32439 // 0.99 in Q15.
+
+/** Performs a safe integer division, returning [Int.MAX_VALUE] if [denominator] = 0. */
+private infix fun Int.safeDiv(denominator: Int) = if (denominator != 0) this / denominator else Int.MAX_VALUE
+
+private data class GaussianProbabilityResult(
+	/** (probability for input) = 1 / std * exp(-(input - mean)^2 / (2 * std^2)) */
+	val probability: Int,
+
+	/**
+	 * Input used when updating the model, Q11.
+	 * delta = (input - mean) / std^2.
+	 */
+	val delta: Int
+)
+
+/**
+ * Calculates the probability for [input], given that [input] comes from a normal distribution with
+ * mean [mean] and standard deviation [std].
+ *
+ * @param [input] Input sample in Q4.
+ * @param [mean] Mean input in the statistical model, Q7.
+ * @param [std] Standard deviation, Q7.
+ */
+private fun getGaussianProbability(input: Int, mean: Int, std: Int): GaussianProbabilityResult {
+	var tmp16: Int
+	var expValue = 0
+
+	// Calculate invStd = 1 / s, in Q10.
+	// 131072 = 1 in Q17, and (std shr 1) is for rounding instead of truncation.
+	// Q-domain: Q17 / Q7 = Q10
+	var tmp32 = 131072 + (std shr 1)
+	val invStd = tmp32 safeDiv std
+
+	// Calculate invStd2 = 1 / s^2, in Q14
+	tmp16 = invStd shr 2 // Q10 -> Q8.
+	// Q-domain: (Q8 * Q8) shr 2 = Q14
+	val invStd2 = (tmp16 * tmp16) shr 2
+
+	tmp16 = input shl 3 // Q4 -> Q7
+	tmp16 -= mean // Q7 - Q7 = Q7
+
+	// To be used later, when updating noise/speech model.
+	// delta = (x - m) / s^2, in Q11.
+	// Q-domain: (Q14 * Q7) shr 10 = Q11
+	val delta = (invStd2 * tmp16) shr 10
+
+	// Calculate the exponent tmp32 = (x - m)^2 / (2 * s^2), in Q10.
+	// Replacing division by two with one shift.
+	// Q-domain: (Q11 * Q7) shr 8 = Q10.
+	tmp32 = (delta * tmp16) shr 9
+
+	// If the exponent is small enough to give a non-zero probability, we calculate
+	// expValue ~= exp(-(x - m)^2 / (2 * s^2))
+	//          ~= exp2(-log2(exp(1)) * tmp32)
+	if (tmp32 < COMP_VAR) {
+		// Calculate tmp16 = log2(exp(1)) * tmp32, in Q10.
+		// Q-domain: (Q12 * Q10) shr 12 = Q10.
+		tmp16 = (LOG2EXP * tmp32) shr 12
+		tmp16 = -tmp16
+		expValue = 0x0400 or (tmp16 and 0x03FF)
+		tmp16 = tmp16 xor 0xFFFF
+		tmp16 = tmp16 shr 10
+		tmp16 += 1
+		// Get expValue = exp(-tmp32) in Q10.
+		expValue = expValue shr tmp16
+	}
+
+	// Calculate and return (1 / s) * exp(-(x - m)^2 / (2 * s^2)), in Q20.
+	// Q-domain: Q10 * Q10 = Q20.
+	val probability = invStd * expValue
+	return GaussianProbabilityResult(probability, delta)
+}
+
+/**
+ * Calculates the weighted average with regard to number of Gaussians.
+ * CAUTION: Modifies [data] by adding [offset] to each element that belongs to [channel].
+ *
+ * @param[data] Data to average, indexed as gaussian * CHANNEL_COUNT + channel.
+ * @param[offset] An offset to add to each element of [data] that belongs to [channel].
+ * @param[weights] Weights used for averaging.
+ * @return The weighted average.
+ */
+private fun getWeightedAverage(data: IntArray, channel: Int, offset: Int, weights: IntArray): Int {
+	var result = 0
+	for (k in 0 until GAUSSIAN_COUNT) {
+		val index = k * CHANNEL_COUNT + channel
+		data[index] += offset
+		result += data[index] * weights[index]
+	}
+	return result
+}
+
+/**
+ * The VAD operating modes in order of increasing aggressiveness.
+ * A more aggressive VAD is more restrictive in reporting speech. Put in other words, the
+ * probability of being speech when the VAD returns 1 is increased with increasing mode. As a
+ * consequence, the missed detection rate also goes up.
+ */
+enum class Aggressiveness {
+	Quality,
+	LowBitrate,
+	Aggressive,
+	VeryAggressive
+}
+
+class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Quality) {
+	// Frame counter and hangover state, updated by getGmmProbability()
+	private var frameCount = 0
+	private var overhang = 0
+	private var speechFrameCount = 0
+
+	// PDF parameters. Copies, so that adapting the model leaves the initial tables untouched.
+	private val noiseMeans = NOISE_DATA_MEANS.copyOf()
+	private val speechMeans = SPEECH_DATA_MEANS.copyOf()
+	private val noiseStds = NOISE_DATA_STDS.copyOf()
+	private val speechStds = SPEECH_DATA_STDS.copyOf()
+
+	// Per channel: the ages of the 16 smallest feature values, and the values themselves
+	private val ageVector = IntArray(16 * CHANNEL_COUNT)
+	private val minimumValueVector = IntArray(16 * CHANNEL_COUNT) { 10000 }
+
+	// Splitting filter states, one pair per split
+	private val upperState = List(5) { MutableInt(0) }
+	private val lowerState = List(5) { MutableInt(0) }
+
+	// High pass filter state
+	private val highPassFilterState = IntArray(4)
+
+	// Smoothed median per channel, maintained by findMinimum()
+	private val median = IntArray(CHANNEL_COUNT) { 1600 }
+
+	private data class ThresholdsRecord(
+		val overhangMax1: Int, val overhangMax2: Int,
+		val localThreshold: Int, val globalThreshold: Int
+	)
+
+	private val thresholds = when (aggressiveness) {
+		Aggressiveness.Quality -> ThresholdsRecord(overhangMax1 = 8, overhangMax2 = 14, localThreshold = 24, globalThreshold = 57)
+		Aggressiveness.LowBitrate -> ThresholdsRecord(overhangMax1 = 8, overhangMax2 = 14, localThreshold = 37, globalThreshold = 100)
+		Aggressiveness.Aggressive -> ThresholdsRecord(overhangMax1 = 6, overhangMax2 = 9, localThreshold = 82, globalThreshold = 285)
+		Aggressiveness.VeryAggressive -> ThresholdsRecord(overhangMax1 = 6, overhangMax2 = 9, localThreshold = 94, globalThreshold = 1100)
+	}
+
+	/**
+	 * Calculates the probabilities for both speech and background noise using Gaussian Mixture
+	 * Models (GMM). A hypothesis-test is performed to decide which type of signal is most probable.
+	 *
+	 * @param[features] Feature vector of length [CHANNEL_COUNT] = log10(energy in frequency band)
+	 * @param[totalEnergy] Total energy in audio frame.
+	 * @param[frameLength] Number of input samples. Must be 80.
+	 * @return VAD decision. 0: no active speech; 1: active speech; above 1: overhang after speech
+	 */
+	private fun getGmmProbability(features: List<Int>, totalEnergy: Int, frameLength: Int): Int {
+		var vadFlag = 0
+		var tmp: Int
+		var tmp1: Int
+		var tmp2: Int
+		val deltaN = IntArray(TABLE_SIZE)
+		val deltaS = IntArray(TABLE_SIZE)
+		val ngprvec = IntArray(TABLE_SIZE) { 0 } // Conditional probability = 0.
+		val sgprvec = IntArray(TABLE_SIZE) { 0 } // Conditional probability = 0.
+		var sumLogLikelihoodRatios = 0
+		val noiseProbability = IntArray(GAUSSIAN_COUNT)
+		val speechProbability = IntArray(GAUSSIAN_COUNT)
+
+		assert(frameLength == 80)
+
+		if (totalEnergy > MIN_ENERGY) {
+			// The signal power of current frame is large enough for processing. The processing
+			// consists of two parts:
+			// 1) Calculating the likelihood of speech and thereby a VAD decision.
+			// 2) Updating the underlying model, w.r.t., the decision made.
+
+			// The detection scheme is an LRT with hypothesis
+			// H0: Noise
+			// H1: Speech
+			//
+			// We combine a global LRT with local tests, for each frequency sub-band, here named
+			// channel.
+			for (channel in 0 until CHANNEL_COUNT) {
+				// For each channel we model the probability with a GMM consisting of
+				// GAUSSIAN_COUNT Gaussians, with different means and standard deviations depending
+				// on H0 or H1.
+				var h0Test = 0
+				var h1Test = 0
+				for (k in 0 until GAUSSIAN_COUNT) {
+					val gaussian = channel + k * CHANNEL_COUNT
+
+					// Probability under H0, that is, probability of frame being noise.
+					// Value given in Q27 = Q7 * Q20.
+					val pNoise = getGaussianProbability(features[channel], noiseMeans[gaussian], noiseStds[gaussian])
+					deltaN[gaussian] = pNoise.delta
+					noiseProbability[k] = NOISE_DATA_WEIGHTS[gaussian] * pNoise.probability
+					h0Test += noiseProbability[k] // Q27
+
+					// Probability under H1, that is, probability of frame being speech.
+					// Value given in Q27 = Q7 * Q20.
+					val pSpeech = getGaussianProbability(features[channel], speechMeans[gaussian], speechStds[gaussian])
+					speechProbability[k] = SPEECH_DATA_WEIGHTS[gaussian] * pSpeech.probability
+					deltaS[gaussian] = pSpeech.delta
+					h1Test += speechProbability[k] // Q27
+				}
+
+				// Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
+				// Approximation:
+				// log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
+				//                           = log2(h1Test) - log2(h0Test)
+				//                           = log2(2^(31-shiftsH1)*(1+b1))
+				//                             - log2(2^(31-shiftsH0)*(1+b0))
+				//                           = shiftsH0 - shiftsH1
+				//                             + log2(1+b1) - log2(1+b0)
+				//                          ~= shiftsH0 - shiftsH1
+				//
+				// Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
+				// Further, b0 and b1 are independent and on the average the two terms cancel.
+				val shiftsH0 = if (h0Test != 0) normSigned(h0Test) else 31
+				val shiftsH1 = if (h1Test != 0) normSigned(h1Test) else 31
+				val logLikelihoodRatio = shiftsH0 - shiftsH1
+
+				// Update sumLogLikelihoodRatios with spectrum weighting.
+				// This is used for the global VAD decision.
+				sumLogLikelihoodRatios += logLikelihoodRatio * SPECTRUM_WEIGHT[channel]
+
+				// Local VAD decision.
+				if ((logLikelihoodRatio * 4) > thresholds.localThreshold) {
+					vadFlag = 1
+				}
+
+				// Calculate local noise probabilities used later when updating the GMM.
+				val h0 = h0Test shr 12 // Q15
+				if (h0 > 0) {
+					// High probability of noise. Assign conditional probabilities for each
+					// Gaussian in the GMM.
+					val tmp3 = (noiseProbability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
+					ngprvec[channel] = tmp3 safeDiv h0 // Q14
+					ngprvec[channel + CHANNEL_COUNT] = 16384 - ngprvec[channel]
+				} else {
+					// Low noise probability. Assign conditional probability 1 to the first
+					// Gaussian and 0 to the rest (which is already set at initialization).
+					ngprvec[channel] = 16384
+				}
+
+				// Calculate local speech probabilities used later when updating the GMM.
+				val h1 = h1Test shr 12 // Q15
+				if (h1 > 0) {
+					// High probability of speech. Assign conditional probabilities for each
+					// Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
+					val tmp3 = (speechProbability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
+					sgprvec[channel] = tmp3 safeDiv h1 // Q14
+					sgprvec[channel + CHANNEL_COUNT] = 16384 - sgprvec[channel]
+				}
+			}
+
+			// Make a global VAD decision.
+			vadFlag = vadFlag or (if (sumLogLikelihoodRatios >= thresholds.globalThreshold) 1 else 0)
+
+			// Update the model parameters.
+			var maxspe = 12800
+			for (channel in 0 until CHANNEL_COUNT) {
+				// Get minimum value in past which is used for long term correction in Q4.
+				val featureMinimum = findMinimum(features[channel], channel)
+
+				// Compute the "global" mean, that is the sum of the two means weighted.
+				var noiseGlobalMean = getWeightedAverage(noiseMeans, channel, 0, NOISE_DATA_WEIGHTS)
+				tmp1 = noiseGlobalMean shr 6 // Q8
+
+				for (k in 0 until GAUSSIAN_COUNT) {
+					val gaussian = channel + k * CHANNEL_COUNT
+
+					val nmk = noiseMeans[gaussian]
+					val smk = speechMeans[gaussian]
+					var nsk = noiseStds[gaussian]
+					var ssk = speechStds[gaussian]
+
+					// Update noise mean vector if the frame consists of noise only.
+					var nmk2 = nmk
+					if (vadFlag == 0) {
+						// deltaN = (x-mu)/sigma^2
+						// ngprvec[k] = noiseProbability[k] / (noiseProbability[0] + noiseProbability[1])
+
+						// (Q14 * Q11 shr 11) = Q14.
+						val delt = (ngprvec[gaussian] * deltaN[gaussian]) shr 11
+						// Q7 + (Q14 * Q15 shr 22) = Q7.
+						nmk2 = nmk + ((delt * NOISE_UPDATE_CONST) shr 22)
+					}
+
+					// Long term correction of the noise mean.
+					// Q8 - Q8 = Q8.
+					val ndelt = (featureMinimum shl 4) - tmp1
+					// Q7 + (Q8 * Q8) shr 9 = Q7.
+					var nmk3 = nmk2 + ((ndelt * BACK_ETA) shr 9)
+
+					// Control that the noise mean does not drift too much.
+					tmp = (k + 5) shl 7
+					if (nmk3 < tmp) {
+						nmk3 = tmp
+					}
+					tmp = (72 + k - channel) shl 7
+					if (nmk3 > tmp) {
+						nmk3 = tmp
+					}
+					noiseMeans[gaussian] = nmk3
+
+					if (vadFlag != 0) {
+						// Update speech mean vector:
+						// deltaS = (x-mu)/sigma^2
+						// sgprvec[k] = speechProbability[k] / (speechProbability[0] + speechProbability[1])
+
+						// (Q14 * Q11) shr 11 = Q14.
+						val delt = (sgprvec[gaussian] * deltaS[gaussian]) shr 11
+						// Q14 * Q15 shr 21 = Q8.
+						tmp = (delt * SPEECH_UPDATE_CONST) shr 21
+						// Q7 + (Q8 shr 1) = Q7. With rounding.
+						var smk2 = smk + ((tmp + 1) shr 1)
+
+						// Control that the speech mean does not drift too much.
+						val maxmu = maxspe + 640
+						if (smk2 < MINIMUM_MEAN[k]) {
+							smk2 = MINIMUM_MEAN[k]
+						}
+						if (smk2 > maxmu) {
+							smk2 = maxmu
+						}
+						speechMeans[gaussian] = smk2 // Q7.
+
+						// (Q7 shr 3) = Q4. With rounding.
+						tmp = (smk + 4) shr 3
+
+						tmp = features[channel] - tmp // Q4
+						// (Q11 * Q4 shr 3) = Q12.
+						var tmp4 = (deltaS[gaussian] * tmp) shr 3
+						var tmp5 = tmp4 - 4096
+						tmp = sgprvec[gaussian] shr 2
+						// (Q14 shr 2) * Q12 = Q24.
+						tmp4 = tmp * tmp5
+
+						tmp5 = tmp4 shr 4 // Q20
+
+						// 0.1 * Q20 / Q7 = Q13.
+						if (tmp5 > 0) {
+							tmp = tmp5 safeDiv (ssk * 10)
+						} else {
+							tmp = -tmp5 safeDiv (ssk * 10)
+							tmp = -tmp
+						}
+						// Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
+						// Note that division by 4 equals shift by 2, hence,
+						// (Q13 shr 8) = (Q13 shr 6) / 4 = Q7.
+						tmp += 128 // Rounding.
+						ssk += (tmp shr 8)
+						if (ssk < MIN_STD) {
+							ssk = MIN_STD
+						}
+						speechStds[gaussian] = ssk
+					} else {
+						// Update GMM variance vectors.
+						// deltaN * (features[channel] - nmk) - 1
+						// Q4 - (Q7 shr 3) = Q4.
+						tmp = features[channel] - (nmk shr 3)
+						// (Q11 * Q4 shr 3) = Q12.
+						var tmp5 = (deltaN[gaussian] * tmp) shr 3
+						tmp5 -= 4096
+
+						// (Q14 shr 2) * Q12 = Q24.
+						tmp = (ngprvec[gaussian] + 2) shr 2
+						val tmp4 = tmp * tmp5
+						// Q20  * approx 0.001 (2^-10=0.0009766), hence,
+						// (Q24 shr 14) = (Q24 shr 4) / 2^10 = Q20.
+						tmp5 = tmp4 shr 14
+
+						// Q20 / Q7 = Q13.
+						if (tmp5 > 0) {
+							tmp = tmp5 safeDiv nsk
+						} else {
+							tmp = -tmp5 safeDiv nsk
+							tmp = -tmp
+						}
+						tmp += 32 // Rounding
+						nsk += tmp shr 6 // Q13 shr 6 = Q7.
+						if (nsk < MIN_STD) {
+							nsk = MIN_STD
+						}
+						noiseStds[gaussian] = nsk
+					}
+				}
+
+				// Separate models if they are too close.
+				// noiseGlobalMean in Q14 (= Q7 * Q7).
+				noiseGlobalMean = getWeightedAverage(noiseMeans, channel, 0, NOISE_DATA_WEIGHTS)
+
+				// speechGlobalMean in Q14 (= Q7 * Q7).
+				var speechGlobalMean = getWeightedAverage(speechMeans, channel, 0, SPEECH_DATA_WEIGHTS)
+
+				// diff = "global" speech mean - "global" noise mean.
+				// (Q14 shr 9) - (Q14 shr 9) = Q5.
+				val diff = (speechGlobalMean shr 9) - (noiseGlobalMean shr 9)
+				if (diff < MINIMUM_DIFFERENCE[channel]) {
+					tmp = MINIMUM_DIFFERENCE[channel] - diff
+
+					// tmp1 = ~0.8 * (MINIMUM_DIFFERENCE - diff) in Q7.
+					// tmp2 = ~0.2 * (MINIMUM_DIFFERENCE - diff) in Q7.
+					tmp1 = (13 * tmp) shr 2
+					tmp2 = (3 * tmp) shr 2
+
+					// Move Gaussian means for speech model by tmp1 and update speechGlobalMean.
+					// Note that the call changes the speech means of this channel in place.
+					speechGlobalMean = getWeightedAverage(speechMeans, channel, tmp1, SPEECH_DATA_WEIGHTS)
+
+					// Move Gaussian means for noise model by -tmp2 and update noiseGlobalMean.
+					// Note that the call changes the noise means of this channel
+					// in place.
+					noiseGlobalMean = getWeightedAverage(noiseMeans, channel, -tmp2, NOISE_DATA_WEIGHTS)
+				}
+
+				// Control that the speech & noise means do not drift too much.
+				maxspe = MAXIMUM_SPEECH[channel]
+				tmp2 = speechGlobalMean shr 7
+				if (tmp2 > maxspe) {
+					// Upper limit of speech model.
+					tmp2 -= maxspe
+
+					for (k in 0 until GAUSSIAN_COUNT) {
+						speechMeans[channel + k * CHANNEL_COUNT] -= tmp2
+					}
+				}
+
+				tmp2 = noiseGlobalMean shr 7
+				if (tmp2 > MAXIMUM_NOISE[channel]) {
+					tmp2 -= MAXIMUM_NOISE[channel]
+
+					for (k in 0 until GAUSSIAN_COUNT) {
+						noiseMeans[channel + k * CHANNEL_COUNT] -= tmp2
+					}
+				}
+			}
+			frameCount++
+		}
+
+		// Smooth with respect to transition hysteresis.
+		if (vadFlag == 0) {
+			if (overhang > 0) {
+				vadFlag = 2 + overhang
+				overhang--
+			}
+			speechFrameCount = 0
+		} else {
+			speechFrameCount++
+			if (speechFrameCount > MAX_SPEECH_FRAMES) {
+				speechFrameCount = MAX_SPEECH_FRAMES
+				overhang = thresholds.overhangMax2
+			} else {
+				overhang = thresholds.overhangMax1
+			}
+		}
+		return vadFlag
+	}
+
+	/**
+	 * Updates and returns the smoothed feature minimum of a channel. As minimum we use the median
+	 * of the five smallest feature values in a 100 frames long window.
+	 *
+	 * First, the values stored in [minimumValueVector] are aged, and values that are 100 frames old
+	 * are removed. Then [featureValue] is inserted if it is smaller than one of the 16 stored
+	 * values. Finally, the median of the five smallest values is smoothed into [median].
+	 *
+	 * As long as [frameCount] is zero, that is, we haven't received any "valid" data, the default
+	 * value of 1600 takes the place of this median.
+	 *
+	 * @param[featureValue] New feature value to update with.
+	 * @param[channel] Channel number. Must be less than [CHANNEL_COUNT].
+	 * @return Smoothed minimum value for a moving window.
+	 */
+	private fun findMinimum(featureValue: Int, channel: Int): Int {
+		assert(channel < CHANNEL_COUNT)
+
+		// Each channel owns 16 consecutive entries of ageVector and minimumValueVector.
+		val base = channel * 16
+
+		// Accessor for the age of each of the channel's values
+		val ages = object {
+			operator fun get(i: Int) = ageVector[base + i]
+			operator fun set(i: Int, value: Int) { ageVector[base + i] = value }
+		}
+
+		// Accessor for the channel's 16 smallest values
+		val minima = object {
+			operator fun get(i: Int) = minimumValueVector[base + i]
+			operator fun set(i: Int, value: Int) { minimumValueVector[base + i] = value }
+		}
+
+		// Every stored value gets one frame older. Values that have reached age 100 are removed.
+		for (slot in 0 until 16) {
+			if (ages[slot] == 100) {
+				// Too old. Close the gap by moving the following values one slot down.
+				for (next in slot until 15) {
+					minima[next] = minima[next + 1]
+					ages[next] = ages[next + 1]
+				}
+				ages[15] = 101
+				minima[15] = 10000
+			} else {
+				ages[slot] = ages[slot] + 1
+			}
+		}
+
+		// Determine the slot at which to insert featureValue, using a fixed tree of comparisons
+		// against the stored values. The result is -1 if featureValue is not smaller than any of them.
+		val position = if (featureValue < minima[7]) {
+			if (featureValue < minima[3]) {
+				if (featureValue < minima[1]) {
+					if (featureValue < minima[0]) 0 else 1
+				} else {
+					if (featureValue < minima[2]) 2 else 3
+				}
+			} else if (featureValue < minima[5]) {
+				if (featureValue < minima[4]) 4 else 5
+			} else {
+				if (featureValue < minima[6]) 6 else 7
+			}
+		} else if (featureValue < minima[15]) {
+			if (featureValue < minima[11]) {
+				if (featureValue < minima[9]) {
+					if (featureValue < minima[8]) 8 else 9
+				} else {
+					if (featureValue < minima[10]) 10 else 11
+				}
+			} else if (featureValue < minima[13]) {
+				if (featureValue < minima[12]) 12 else 13
+			} else {
+				if (featureValue < minima[14]) 14 else 15
+			}
+		} else {
+			-1
+		}
+
+		// A new small value was found. Make room by moving larger values one slot up, then store it.
+		if (position >= 0) {
+			for (slot in 15 downTo position + 1) {
+				minima[slot] = minima[slot - 1]
+				ages[slot] = ages[slot - 1]
+			}
+			minima[position] = featureValue
+			ages[position] = 1
+		}
+
+		// Get the current median: the third stored value once more than two frames were counted,
+		// the first stored value for the first counted frames, and the default value before that.
+		val currentMedian = when {
+			frameCount > 2 -> minima[2]
+			frameCount > 0 -> minima[0]
+			else -> 1600
+		}
+
+		// Choose the smoothing factor. It stays zero as long as no frame has been counted.
+		val alpha = when {
+			frameCount <= 0 -> 0
+			currentMedian < median[channel] -> SMOOTHING_DOWN // 0.2 in Q15.
+			else -> SMOOTHING_UP // 0.99 in Q15.
+		}
+
+		// Smooth the median value: mix the previous and the current median, with rounding.
+		val weightedSum = (alpha + 1) * median[channel] + (Short.MAX_VALUE - alpha) * currentMedian
+		val smoothed = (weightedSum + 16384) shr 15
+		median[channel] = smoothed
+
+		return smoothed
+	}
+
+	private data class FeatureResult(
+		// 10 * log10(energy in each frequency band) in Q4, ordered from the lowest band to the highest
+		val features: List<Int>,
+		// Approximate total energy of the signal
+		// NOTE: This value is not exact. It is only compared against MIN_ENERGY.
+		val totalEnergy: Int
+	)
+
+	/**
+	 * Takes an audio buffer of 80 samples and calculates the logarithm of the energy of each of
+	 * the [CHANNEL_COUNT] = 6 frequency bands used by the VAD:
+	 * 80-250 Hz, 250-500 Hz, 500-1000 Hz, 1000-2000 Hz, 2000-3000 Hz, 3000-4000 Hz.
+	 *
+	 * The values are given in Q4 and returned as features, ordered from the lowest band to the
+	 * highest. Further, an approximate overall energy is returned. That value is used in
+	 * [getGmmProbability] as a signal indicator, hence it is arbitrary above the threshold
+	 * [MIN_ENERGY].
+	 */
+	private fun calculateFeatures(input: AudioBuffer): FeatureResult {
+		assert(input.size == 80)
+
+		// The band splits below each use their own pair of filter states.
+		// Split the full band (0 to 4000 Hz) at 2000 Hz and downsample.
+		val (`2000 to 4000 Hz`, `0 to 2000 Hz`) =
+			splitFilter(input, upperState[0], lowerState[0])
+
+		// Split the upper band (2000 to 4000 Hz) at 3000 Hz and downsample.
+		val (`3000 to 4000 Hz`, `2000 to 3000 Hz`) =
+			splitFilter(`2000 to 4000 Hz`, upperState[1], lowerState[1])
+
+		// Split the lower band (0 to 2000 Hz) at 1000 Hz and downsample.
+		val (`1000 to 2000 Hz`, `0 to 1000 Hz`) =
+			splitFilter(`0 to 2000 Hz`, upperState[2], lowerState[2])
+
+		// Split the band from 0 to 1000 Hz at 500 Hz and downsample.
+		val (`500 to 1000 Hz`, `0 to 500 Hz`) =
+			splitFilter(`0 to 1000 Hz`, upperState[3], lowerState[3])
+
+		// Split the band from 0 to 500 Hz at 250 Hz and downsample.
+		val (`250 to 500 Hz`, `0 to 250 Hz`) =
+			splitFilter(`0 to 500 Hz`, upperState[4], lowerState[4])
+
+		// Remove 0 to 80 Hz by high pass filtering the lowest band.
+		val `80 to 250 Hz` = highPassFilter(`0 to 250 Hz`, highPassFilterState)
+
+		// Calculate the log energy of each band, from the highest band to the lowest. Every call
+		// receives the same total energy accumulator.
+		val energySum = MutableInt(0)
+		val `log energy 3000 to 4000 Hz` = logOfEnergy(`3000 to 4000 Hz`, SPLIT_FILTER_OFFSETS[5], energySum)
+		val `log energy 2000 to 3000 Hz` = logOfEnergy(`2000 to 3000 Hz`, SPLIT_FILTER_OFFSETS[4], energySum)
+		val `log energy 1000 to 2000 Hz` = logOfEnergy(`1000 to 2000 Hz`, SPLIT_FILTER_OFFSETS[3], energySum)
+		val `log energy 500 to 1000 Hz` = logOfEnergy(`500 to 1000 Hz`, SPLIT_FILTER_OFFSETS[2], energySum)
+		val `log energy 250 to 500 Hz` = logOfEnergy(`250 to 500 Hz`, SPLIT_FILTER_OFFSETS[1], energySum)
+		val `log energy 80 to 250 Hz` = logOfEnergy(`80 to 250 Hz`, SPLIT_FILTER_OFFSETS[0], energySum)
+
+		// The feature vector lists the bands in the opposite order of the calculation above,
+		// that is, from the lowest band to the highest.
+		val features = listOf(
+			`log energy 80 to 250 Hz`,
+			`log energy 250 to 500 Hz`,
+			`log energy 500 to 1000 Hz`,
+			`log energy 1000 to 2000 Hz`,
+			`log energy 2000 to 3000 Hz`,
+			`log energy 3000 to 4000 Hz`
+		)
+		assert(features.size == CHANNEL_COUNT)
+		return FeatureResult(features, energySum.toInt())
+	}
+
+	/**
+	 * Calculates a VAD decision for the specified audio frame, which must be 10 ms long and
+	 * sampled at 8 kHz, that is, it must consist of exactly 80 samples.
+	 *
+	 * @return If true, the frame contains voice activity.
+	 */
+	fun process(audioFrame: AudioBuffer): Boolean {
+		// Get the log energy of each frequency band and the approximate total energy
+		val frameFeatures = calculateFeatures(audioFrame)
+
+		// Classify the frame. This also adapts the models and the overhang state.
+		val decision = getGmmProbability(frameFeatures.features, frameFeatures.totalEnergy, audioFrame.size)
+
+		// Only a direct detection (1) counts. Overhang results (above 1) are reported as no activity.
+		return decision == 1
+	}
+}