import kotlin.math.absoluteValue

// NOTE (port assumption): the original C code operates on 16-bit PCM samples.
// |Sample| and |AudioBuffer| are assumed to be 16-bit storage as below; drop
// these aliases if the project defines them elsewhere.
typealias Sample = Short
typealias AudioBuffer = ShortArray

///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/signal_processing/include/signal_processing_library.h

// Macros specific for the fixed point implementation
val WEBRTC_SPL_WORD16_MAX = 32767

///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/signal_processing/include/spl_inl.h
// webrtc/common_audio/signal_processing/spl_inl.c

// Table used by CountLeadingZeros32. For each UInt n that's a sequence of 0
// bits followed by a sequence of 1 bits, the entry at index
// (n * 0x8c0b2891) shr 26 in this table gives the number of zero bits in n.
val kCountLeadingZeros32_Table = intArrayOf(
    32, 8, 17, -1, -1, 14, -1, -1, -1, 20, -1, -1, -1, 28, -1, 18,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 26, 25, 24,
    4, 11, 23, 31, 3, 7, 10, 16, 22, 30, -1, -1, 2, 6, 13, 9,
    -1, 15, -1, 21, -1, 29, 19, -1, -1, -1, -1, -1, 1, 27, 5, 12
).apply { assert(size == 64) }

// Returns the number of leading zero bits in the argument.
fun CountLeadingZeros32(n: UInt): Int {
    // Normalize n by rounding up to the nearest number that is a sequence of 0
    // bits followed by a sequence of 1 bits. This number has the same number of
    // leading zeros as the original n. There are exactly 33 such values.
    var normalized = n or (n shr 1)
    normalized = normalized or (normalized shr 2)
    normalized = normalized or (normalized shr 4)
    normalized = normalized or (normalized shr 8)
    normalized = normalized or (normalized shr 16)

    // Multiply the modified n with a constant selected (by exhaustive search)
    // such that each of the 33 possible values of n give a product whose 6 most
    // significant bits are unique. Then look up the answer in the table.
    return kCountLeadingZeros32_Table[((normalized * 0x8c0b2891u) shr 26).toInt()]
}

// Returns the number of steps a signed int can be left-shifted without
// overflow, or 0 if a == 0.
fun NormW32(a: Int): Int {
    return if (a == 0) 0 else CountLeadingZeros32((if (a < 0) a.inv() else a).toUInt()) - 1
}

// Returns the number of steps an unsigned int can be left-shifted without
// overflow, or 0 if a == 0.
fun NormU32(a: UInt): Int {
    return if (a == 0u) 0 else CountLeadingZeros32(a)
}

fun GetSizeInBits(n: UInt): Int {
    return 32 - CountLeadingZeros32(n)
}
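// A minimal sanity check for the bit-twiddling helpers above, using
// kotlin.Int.countLeadingZeroBits() (Kotlin >= 1.5) as the reference.
// Illustrative only; not part of the WebRTC port.
fun sanityCheckLeadingZeros() {
    for (n in listOf(0u, 1u, 255u, 0x8000_0000u, 0xFFFF_FFFFu)) {
        check(CountLeadingZeros32(n) == n.toInt().countLeadingZeroBits())
    }
    // NormW32 gives the headroom of a signed value: 16383 (14 bits) can be
    // shifted left 17 steps before the product would touch the sign bit.
    check(NormW32(16383) == 17)
}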
///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/signal_processing/get_scaling_square.c

//
// GetScalingSquare(...)
//
// Returns the number of bits required to scale the samples in |buffer| so
// that, if the squares of the samples are added the number of times specified
// by the |times| parameter, the 32-bit addition will not overflow (the result
// fits in an Int).
//
// Input:
//      - buffer    : Input vector to check scaling on
//      - times     : Number of additions to be performed
//
// Return value    : Number of right bit shifts needed to avoid
//                   overflow in the addition calculation
fun GetScalingSquare(buffer: AudioBuffer, times: Int): Int {
    var maxAbsSample = -1
    for (i in 0 until buffer.size) {
        val absSample = buffer[i].toInt().absoluteValue
        if (absSample > maxAbsSample) {
            maxAbsSample = absSample
        }
    }
    if (maxAbsSample == 0) {
        return 0 // Since norm(0) returns 0
    }
    val t = NormW32(maxAbsSample * maxAbsSample)
    val bitCount = GetSizeInBits(times.toUInt())
    return if (t > bitCount) 0 else bitCount - t
}

///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/signal_processing/energy.c

data class EnergyResult(
    // Number of left bit shifts needed to get the physical energy value, i.e.,
    // to get the Q0 value.
    val scaleFactor: Int,
    // Energy value in Q(-|scaleFactor|).
    val energy: Int
)

//
// Energy(...)
//
// Calculates the energy of a vector.
//
// Input:
//      - buffer    : Vector which the energy should be calculated on
//
// Return value    : EnergyResult holding |scaleFactor|, the number of left bit
//                   shifts needed to get the physical (Q0) energy value, and
//                   the energy value in Q(-|scaleFactor|).
fun Energy(buffer: AudioBuffer): EnergyResult {
    val scaling = GetScalingSquare(buffer, buffer.size)
    var energy = 0
    for (i in 0 until buffer.size) {
        energy += (buffer[i] * buffer[i]) shr scaling
    }
    return EnergyResult(scaling, energy)
}

///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/signal_processing/division_operations.c

//
// DivW32W16(...)
//
// Divides an Int |num| by an Int |den|.
//
// If |den| == 0, Int.MAX_VALUE (0x7FFFFFFF) is returned.
//
// Input:
//      - num       : Numerator
//      - den       : Denominator
//
// Return value    : Result of the division, i.e., the integer part of
//                   num / den.
fun DivW32W16(num: Int, den: Int) = if (den != 0) num / den else Int.MAX_VALUE
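// A small worked example of the Q-format bookkeeping above (illustrative
// sketch, values assumed): for a constant buffer of amplitude 1000 and length
// 80 the true energy is 80 * 1000^2 = 8e7, and |scaleFactor| undoes whatever
// per-term right shift Energy() applied.
fun energyExample() {
    val buffer = ShortArray(80) { 1000 }
    val (scaleFactor, energy) = Energy(buffer)
    // |energy| is in Q(-scaleFactor); shifting left by |scaleFactor| recovers
    // the approximate physical (Q0) energy, up to the precision lost per term.
    val approxQ0 = energy.toLong() shl scaleFactor
    check(approxQ0 <= 80L * 1000 * 1000)
}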
///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/vad/vad_gmm.c

data class GaussianProbabilityResult(
    // (probability for |input|) = 1 / |std| * exp(-(|input| - |mean|)^2 / (2 * |std|^2))
    val probability: Int,
    // Input used when updating the model, Q11.
    // |delta| = (|input| - |mean|) / |std|^2.
    val delta: Int
)

val kCompVar = 22005
val kLog2Exp = 5909 // log2(exp(1)) in Q12.

// Calculates the probability for |input|, given that |input| comes from a
// normal distribution with mean and standard deviation (|mean|, |std|).
//
// Inputs:
//      - input     : input sample in Q4.
//      - mean      : mean input in the statistical model, Q7.
//      - std       : standard deviation, Q7.
//
// Output:
//      - delta     : input used when updating the model, Q11.
//                    |delta| = (|input| - |mean|) / |std|^2.
//
// Return:
//   (probability for |input|) =
//      1 / |std| * exp(-(|input| - |mean|)^2 / (2 * |std|^2))
//
// For a normal distribution, the probability of |input| is calculated and
// returned (in Q20). The formula for normal distributed probability is
//
//   1 / s * exp(-(x - m)^2 / (2 * s^2))
//
// where the parameters are given in the following Q domains:
//   m = |mean| (Q7)
//   s = |std| (Q7)
//   x = |input| (Q4)
// In addition to the probability we output |delta| (in Q11) used when updating
// the noise/speech model.
fun GaussianProbability(input: Int, mean: Int, std: Int): GaussianProbabilityResult {
    var tmp16: Int
    var exp_value = 0

    // Calculate |inv_std| = 1 / s, in Q10.
    // 131072 = 1 in Q17, and (|std| shr 1) is for rounding instead of
    // truncation.
    // Q-domain: Q17 / Q7 = Q10.
    var tmp32 = 131072 + (std shr 1)
    val inv_std = DivW32W16(tmp32, std)

    // Calculate |inv_std2| = 1 / s^2, in Q14.
    tmp16 = inv_std shr 2 // Q10 -> Q8.
    // Q-domain: (Q8 * Q8) shr 2 = Q14.
    val inv_std2 = (tmp16 * tmp16) shr 2

    tmp16 = input shl 3 // Q4 -> Q7
    tmp16 -= mean // Q7 - Q7 = Q7

    // To be used later, when updating noise/speech model.
    // |delta| = (x - m) / s^2, in Q11.
    // Q-domain: (Q14 * Q7) shr 10 = Q11.
    val delta = (inv_std2 * tmp16) shr 10

    // Calculate the exponent |tmp32| = (x - m)^2 / (2 * s^2), in Q10. Replacing
    // division by two with one extra shift.
    // Q-domain: (Q11 * Q7) shr 8 = Q10.
    tmp32 = (delta * tmp16) shr 9

    // If the exponent is small enough to give a non-zero probability we
    // calculate
    //   |exp_value| ~= exp(-(x - m)^2 / (2 * s^2))
    //              ~= exp2(-log2(exp(1)) * |tmp32|).
    if (tmp32 < kCompVar) {
        // Calculate |tmp16| = log2(exp(1)) * |tmp32|, in Q10.
        // Q-domain: (Q12 * Q10) shr 12 = Q10.
        tmp16 = (kLog2Exp * tmp32) shr 12
        tmp16 = -tmp16
        // Mantissa: 1.frac in Q10, where frac is the fractional part of |tmp16|.
        exp_value = 0x0400 or (tmp16 and 0x03FF)
        // Exponent: the integer part of |tmp16|, computed with the same int16
        // bit tricks as the C original (~x, shr 10, plus 1). The shift distance
        // agrees with C because Kotlin masks shifts to 0..31.
        tmp16 = tmp16 xor 0xFFFF
        tmp16 = tmp16 shr 10
        tmp16 += 1
        // Get |exp_value| = exp(-|tmp32|) in Q10.
        exp_value = exp_value shr tmp16
    }

    // Calculate and return (1 / s) * exp(-(x - m)^2 / (2 * s^2)), in Q20.
    // Q-domain: Q10 * Q10 = Q20.
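// A brief usage sketch of GaussianProbability and its Q domains (assumed
// values, illustrative only): an input equal to the mean gives the peak
// probability 1/s and |delta| = 0.
fun gaussianProbabilityExample() {
    val mean = 7000 // Q7, i.e., about 54.7 in Q0.
    val std = 500 // Q7, i.e., about 3.9 in Q0.
    val input = mean shr 3 // The same point expressed in Q4.
    val (probability, delta) = GaussianProbability(input, mean, std)
    check(delta == 0) // (x - m) = 0.
    // |probability| is in Q20; 1/s in Q0 is roughly probability / 2^20.
    check(probability > 0)
}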
    val probability = inv_std * exp_value
    return GaussianProbabilityResult(probability, delta)
}

///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/vad/vad_core.c

// Constants used in GmmProbability().
//
// Number of frequency bands (named channels).
val kNumChannels = 6
// Number of Gaussians per channel in the GMM.
val kNumGaussians = 2
// Index = gaussian * kNumChannels + channel.
val kTableSize = kNumChannels * kNumGaussians
// Maximum number of counted speech (VAD = 1) frames in a row.
val kMaxSpeechFrames: Int = 6
// Minimum standard deviation for both speech and noise.
val kMinStd: Int = 384
// Minimum energy required to trigger audio signal.
val kMinEnergy = 10

// NOTE: kNumChannels, kNumGaussians and kTableSize are declared before the
// tables below on purpose. Top-level vals are initialized in textual order in
// Kotlin, so the size asserts would otherwise run against uninitialized
// (zero) values.

// Spectrum Weighting
val kSpectrumWeight = intArrayOf(6, 8, 10, 12, 14, 16).apply { assert(size == kNumChannels) }
val kNoiseUpdateConst: Int = 655 // Q15
val kSpeechUpdateConst: Int = 6554 // Q15
val kBackEta: Int = 154 // Q8
// Minimum difference between the two models, Q5
val kMinimumDifference = intArrayOf(544, 544, 576, 576, 576, 576).apply { assert(size == kNumChannels) }
// Upper limit of mean value for speech model, Q7
val kMaximumSpeech = intArrayOf(11392, 11392, 11520, 11520, 11520, 11520).apply { assert(size == kNumChannels) }
// Minimum value for mean value
val kMinimumMean = intArrayOf(640, 768).apply { assert(size == kNumGaussians) }
// Upper limit of mean value for noise model, Q7
val kMaximumNoise = intArrayOf(9216, 9088, 8960, 8832, 8704, 8576).apply { assert(size == kNumChannels) }

// Start values for the Gaussian models, Q7
// Weights for the two Gaussians for the six channels (noise)
val kNoiseDataWeights = intArrayOf(34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103).apply { assert(size == kTableSize) }
// Weights for the two Gaussians for the six channels (speech)
val kSpeechDataWeights = intArrayOf(48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81).apply { assert(size == kTableSize) }
// Means for the two Gaussians for the six channels (noise)
val kNoiseDataMeans = intArrayOf(6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362).apply { assert(size == kTableSize) }
// Means for the two Gaussians for the six channels (speech)
val kSpeechDataMeans = intArrayOf(8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483).apply { assert(size == kTableSize) }
// Stds for the two Gaussians for the six channels (noise)
val kNoiseDataStds = intArrayOf(378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455).apply { assert(size == kTableSize) }
// Stds for the two Gaussians for the six channels (speech)
val kSpeechDataStds = intArrayOf(555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850).apply { assert(size == kTableSize) }

/**
 * The VAD operating modes in order of increasing aggressiveness.
 * A more aggressive VAD is more restrictive in reporting speech. Put in other words, the
 * probability of being speech when the VAD returns 1 is increased with increasing mode. As a
 * consequence, the missed detection rate also goes up.
 */
enum class Aggressiveness {
    Quality,
    LowBitrate,
    Aggressive,
    VeryAggressive
}

class VadInstT(aggressiveness: Aggressiveness = Aggressiveness.Quality) {
    // General variables
    var vad: Int = 1 // Speech active (=1)
    // TODO(bjornv): Change to |frame_count|.
    var frame_counter: Int = 0
    var over_hang: Int = 0
    var num_of_speech: Int = 0

    // PDF parameters. (Note: |speech_means| starts from the speech means
    // table; the original line mistakenly cloned the stds table.)
    var noise_means = kNoiseDataMeans.clone()
    var speech_means = kSpeechDataMeans.clone()
    var noise_stds = kNoiseDataStds.clone()
    var speech_stds = kSpeechDataStds.clone()

    // Index vector
    // TODO(bjornv): Change to |age_vector|.
    var index_vector = IntArray(16 * kNumChannels) { 0 }
    // Minimum value vector
    var low_value_vector = IntArray(16 * kNumChannels) { 10000 }

    // Splitting filter states
    var upper_state = IntArray(5) { 0 }
    var lower_state = IntArray(5) { 0 }

    // High pass filter states
    var hp_filter_state = IntArray(4) { 0 }

    // Mean value memory for FindMinimum()
    // TODO(bjornv): Change to |median|.
    var mean_value = IntArray(kNumChannels) { 1600 }

    // Thresholds
    val over_hang_max_1: Int = when (aggressiveness) {
        Aggressiveness.Quality, Aggressiveness.LowBitrate -> 8
        Aggressiveness.Aggressive, Aggressiveness.VeryAggressive -> 6
    }
    val over_hang_max_2: Int = when (aggressiveness) {
        Aggressiveness.Quality, Aggressiveness.LowBitrate -> 14
        Aggressiveness.Aggressive, Aggressiveness.VeryAggressive -> 9
    }
    // TODO: Rename to localThreshold?
    val individual: Int = when (aggressiveness) {
        Aggressiveness.Quality -> 24
        Aggressiveness.LowBitrate -> 37
        Aggressiveness.Aggressive -> 82
        Aggressiveness.VeryAggressive -> 94
    }
    // TODO: Rename to globalThreshold?
    val total: Int = when (aggressiveness) {
        Aggressiveness.Quality -> 57
        Aggressiveness.LowBitrate -> 100
        Aggressiveness.Aggressive -> 285
        Aggressiveness.VeryAggressive -> 1100
    }
}

typealias WebRtcVadInst = VadInstT

// Calculates the weighted average w.r.t. number of Gaussians. The |data| are
// updated with an |offset| before averaging.
//
//      - data    [i/o] : Data to average.
//      - channel [i]   : Channel to average over.
//      - offset  [i]   : An offset added to |data|.
//      - weights [i]   : Weights used for averaging.
//
//      - returns       : The weighted average.
fun WeightedAverage(data: IntArray, channel: Int, offset: Int, weights: IntArray): Int {
    var weighted_average = 0
    for (k in 0 until kNumGaussians) {
        val index = k * kNumChannels + channel
        data[index] += offset
        weighted_average += data[index] * weights[index]
    }
    return weighted_average
}
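// A small illustration of WeightedAverage's Q bookkeeping (illustrative
// sketch): the "global" noise mean of channel 0 at start-up, in
// Q14 = Q7 (means) * Q7 (weights).
fun weightedAverageExample() {
    val means = kNoiseDataMeans.clone()
    // The two Gaussians of channel 0 sit at indices 0 and kNumChannels.
    val globalMean = WeightedAverage(means, 0, 0, kNoiseDataWeights)
    check(globalMean == kNoiseDataWeights[0] * kNoiseDataMeans[0] +
            kNoiseDataWeights[kNumChannels] * kNoiseDataMeans[kNumChannels])
}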
// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
// type of signal is most probable.
//
//      - self         [i/o] : Pointer to VAD instance
//      - features     [i]   : Feature vector of length |kNumChannels|
//                             = log10(energy in frequency band)
//      - total_power  [i]   : Total power in audio frame.
//      - frame_length [i]   : Number of input samples
//
//      - returns            : the VAD decision (0 - noise, 1 - speech).
fun GmmProbability(self: VadInstT, features: IntArray, total_power: Int, frame_length: Int): Int {
    var vadflag = 0
    var tmp_s16: Int
    var tmp1_s16: Int
    var tmp2_s16: Int
    val deltaN = IntArray(kTableSize)
    val deltaS = IntArray(kTableSize)
    val ngprvec = IntArray(kTableSize) { 0 } // Conditional probability = 0.
    val sgprvec = IntArray(kTableSize) { 0 } // Conditional probability = 0.
    var sum_log_likelihood_ratios = 0
    val noise_probability = IntArray(kNumGaussians)
    val speech_probability = IntArray(kNumGaussians)

    assert(frame_length == 80)

    if (total_power > kMinEnergy) {
        // The signal power of current frame is large enough for processing. The
        // processing consists of two parts:
        // 1) Calculating the likelihood of speech and thereby a VAD decision.
        // 2) Updating the underlying model, w.r.t., the decision made.

        // The detection scheme is an LRT with hypothesis
        //   H0: Noise
        //   H1: Speech
        //
        // We combine a global LRT with local tests, for each frequency sub-band,
        // here defined as |channel|.
        for (channel in 0 until kNumChannels) {
            // For each channel we model the probability with a GMM consisting of
            // |kNumGaussians|, with different means and standard deviations
            // depending on H0 or H1.
            var h0_test = 0
            var h1_test = 0
            for (k in 0 until kNumGaussians) {
                val gaussian = channel + k * kNumChannels

                // Probability under H0, that is, probability of frame being noise.
                // Value given in Q27 = Q7 * Q20.
                val pNoise = GaussianProbability(features[channel],
                        self.noise_means[gaussian], self.noise_stds[gaussian])
                deltaN[gaussian] = pNoise.delta
                noise_probability[k] = kNoiseDataWeights[gaussian] * pNoise.probability
                h0_test += noise_probability[k] // Q27

                // Probability under H1, that is, probability of frame being speech.
                // Value given in Q27 = Q7 * Q20.
                val pSpeech = GaussianProbability(features[channel],
                        self.speech_means[gaussian], self.speech_stds[gaussian])
                speech_probability[k] = kSpeechDataWeights[gaussian] * pSpeech.probability
                deltaS[gaussian] = pSpeech.delta
                h1_test += speech_probability[k] // Q27
            }

            // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
            // Approximation:
            // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
            //                           = log2(h1_test) - log2(h0_test)
            //                           = log2(2^(31-shifts_h1)*(1+b1))
            //                             - log2(2^(31-shifts_h0)*(1+b0))
            //                           = shifts_h0 - shifts_h1
            //                             + log2(1+b1) - log2(1+b0)
            //                          ~= shifts_h0 - shifts_h1
            //
            // Note that b0 and b1 are values less than 1, hence,
            // 0 <= log2(1+b0) < 1. Further, b0 and b1 are independent and on
            // the average the two terms cancel.
            val shifts_h0 = if (h0_test != 0) NormW32(h0_test) else 31
            val shifts_h1 = if (h1_test != 0) NormW32(h1_test) else 31
            val log_likelihood_ratio = shifts_h0 - shifts_h1

            // Update |sum_log_likelihood_ratios| with spectrum weighting. This
            // is used for the global VAD decision.
            sum_log_likelihood_ratios += log_likelihood_ratio * kSpectrumWeight[channel]

            // Local VAD decision.
            if ((log_likelihood_ratio * 4) > self.individual) {
                vadflag = 1
            }

            // TODO(bjornv): The conditional probabilities below are applied on
            // the hard coded number of Gaussians set to two. Find a way to
            // generalize.
            // Calculate local noise probabilities used later when updating the
            // GMM.
            val h0 = h0_test shr 12 // Q15
            if (h0 > 0) {
                // High probability of noise. Assign conditional probabilities
                // for each Gaussian in the GMM.
                val tmp = (noise_probability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
                ngprvec[channel] = DivW32W16(tmp, h0) // Q14
                ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel]
            } else {
                // Low noise probability. Assign conditional probability 1 to
                // the first Gaussian and 0 to the rest (which is already set
                // at initialization).
                ngprvec[channel] = 16384
            }

            // Calculate local speech probabilities used later when updating
            // the GMM.
            val h1 = h1_test shr 12 // Q15
            if (h1 > 0) {
                // High probability of speech. Assign conditional probabilities
                // for each Gaussian in the GMM. Otherwise use the initialized
                // values, i.e., 0.
                val tmp = (speech_probability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
                sgprvec[channel] = DivW32W16(tmp, h1) // Q14
                sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel]
            }
        }

        // Make a global VAD decision.
        vadflag = vadflag or (if (sum_log_likelihood_ratios >= self.total) 1 else 0)

        // Update the model parameters.
        var maxspe = 12800
        for (channel in 0 until kNumChannels) {
            // Get minimum value in past which is used for long term correction
            // in Q4.
            val feature_minimum = FindMinimum(self, features[channel], channel)

            // Compute the "global" mean, that is the sum of the two means
            // weighted.
            var noise_global_mean = WeightedAverage(self.noise_means, channel, 0, kNoiseDataWeights)
            tmp1_s16 = noise_global_mean shr 6 // Q8

            for (k in 0 until kNumGaussians) {
                val gaussian = channel + k * kNumChannels

                val nmk = self.noise_means[gaussian]
                val smk = self.speech_means[gaussian]
                var nsk = self.noise_stds[gaussian]
                var ssk = self.speech_stds[gaussian]

                // Update noise mean vector if the frame consists of noise only.
                var nmk2 = nmk
                if (vadflag == 0) {
                    // deltaN = (x - mu) / sigma^2
                    // ngprvec[k] = |noise_probability[k]| /
                    //     (|noise_probability[0]| + |noise_probability[1]|)

                    // (Q14 * Q11 shr 11) = Q14.
                    val delt = (ngprvec[gaussian] * deltaN[gaussian]) shr 11
                    // Q7 + (Q14 * Q15 shr 22) = Q7. Note the parentheses around
                    // the shift: in Kotlin the infix |shr| binds looser than |+|.
                    nmk2 = nmk + ((delt * kNoiseUpdateConst) shr 22)
                }

                // Long term correction of the noise mean.
                // Q8 - Q8 = Q8.
                val ndelt = (feature_minimum shl 4) - tmp1_s16
                // Q7 + (Q8 * Q8) shr 9 = Q7.
                var nmk3 = nmk2 + ((ndelt * kBackEta) shr 9)

                // Control that the noise mean does not drift too much.
                tmp_s16 = (k + 5) shl 7
                if (nmk3 < tmp_s16) {
                    nmk3 = tmp_s16
                }
                tmp_s16 = (72 + k - channel) shl 7
                if (nmk3 > tmp_s16) {
                    nmk3 = tmp_s16
                }
                self.noise_means[gaussian] = nmk3

                if (vadflag != 0) {
                    // Update speech mean vector:
                    // |deltaS| = (x - mu) / sigma^2
                    // sgprvec[k] = |speech_probability[k]| /
                    //     (|speech_probability[0]| + |speech_probability[1]|)

                    // (Q14 * Q11) shr 11 = Q14.
                    val delt = (sgprvec[gaussian] * deltaS[gaussian]) shr 11
                    // Q14 * Q15 shr 21 = Q8.
                    tmp_s16 = (delt * kSpeechUpdateConst) shr 21
                    // Q7 + (Q8 shr 1) = Q7. With rounding.
                    var smk2 = smk + ((tmp_s16 + 1) shr 1)

                    // Control that the speech mean does not drift too much.
                    val maxmu = maxspe + 640
                    if (smk2 < kMinimumMean[k]) {
                        smk2 = kMinimumMean[k]
                    }
                    if (smk2 > maxmu) {
                        smk2 = maxmu
                    }
                    self.speech_means[gaussian] = smk2 // Q7.

                    // (Q7 shr 3) = Q4. With rounding.
                    tmp_s16 = (smk + 4) shr 3
                    tmp_s16 = features[channel] - tmp_s16 // Q4
                    // (Q11 * Q4 shr 3) = Q12.
                    var tmp1_s32 = (deltaS[gaussian] * tmp_s16) shr 3
                    var tmp2_s32 = tmp1_s32 - 4096
                    tmp_s16 = sgprvec[gaussian] shr 2
                    // (Q14 shr 2) * Q12 = Q24.
                    tmp1_s32 = tmp_s16 * tmp2_s32
                    tmp2_s32 = tmp1_s32 shr 4 // Q20

                    // 0.1 * Q20 / Q7 = Q13.
                    tmp_s16 = if (tmp2_s32 > 0) {
                        DivW32W16(tmp2_s32, ssk * 10)
                    } else {
                        -DivW32W16(-tmp2_s32, ssk * 10)
                    }
                    // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
                    // Note that division by 4 equals shift by 2, hence,
                    // (Q13 shr 8) = (Q13 shr 6) / 4 = Q7.
                    tmp_s16 += 128 // Rounding.
                    ssk += tmp_s16 shr 8
                    if (ssk < kMinStd) {
                        ssk = kMinStd
                    }
                    self.speech_stds[gaussian] = ssk
                } else {
                    // Update GMM variance vectors.
                    // deltaN * (features[channel] - nmk) - 1
                    // Q4 - (Q7 shr 3) = Q4.
                    tmp_s16 = features[channel] - (nmk shr 3)
                    // (Q11 * Q4 shr 3) = Q12.
                    var tmp1_s32 = (deltaN[gaussian] * tmp_s16) shr 3
                    tmp1_s32 -= 4096

                    // (Q14 shr 2) * Q12 = Q24.
                    tmp_s16 = (ngprvec[gaussian] + 2) shr 2
                    val tmp2_s32 = tmp_s16 * tmp1_s32
                    // Q20 * approx 0.001 (2^-10 = 0.0009766), hence,
                    // (Q24 shr 14) = (Q24 shr 4) / 2^10 = Q20.
                    tmp1_s32 = tmp2_s32 shr 14

                    // Q20 / Q7 = Q13.
                    tmp_s16 = if (tmp1_s32 > 0) {
                        DivW32W16(tmp1_s32, nsk)
                    } else {
                        -DivW32W16(-tmp1_s32, nsk)
                    }
                    tmp_s16 += 32 // Rounding
                    nsk += tmp_s16 shr 6 // Q13 shr 6 = Q7.
                    if (nsk < kMinStd) {
                        nsk = kMinStd
                    }
                    self.noise_stds[gaussian] = nsk
                }
            }

            // Separate models if they are too close.
            // |noise_global_mean| in Q14 (= Q7 * Q7).
            noise_global_mean = WeightedAverage(self.noise_means, channel, 0, kNoiseDataWeights)

            // |speech_global_mean| in Q14 (= Q7 * Q7).
            var speech_global_mean = WeightedAverage(self.speech_means, channel, 0, kSpeechDataWeights)

            // |diff| = "global" speech mean - "global" noise mean.
            // (Q14 shr 9) - (Q14 shr 9) = Q5.
            val diff = (speech_global_mean shr 9) - (noise_global_mean shr 9)
            if (diff < kMinimumDifference[channel]) {
                tmp_s16 = kMinimumDifference[channel] - diff

                // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
                // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
                tmp1_s16 = (13 * tmp_s16) shr 2
                tmp2_s16 = (3 * tmp_s16) shr 2

                // Move Gaussian means for speech model by |tmp1_s16| and update
                // |speech_global_mean|. Note that |self.speech_means[channel]|
                // is changed after the call.
                speech_global_mean = WeightedAverage(self.speech_means, channel, tmp1_s16, kSpeechDataWeights)

                // Move Gaussian means for noise model by -|tmp2_s16| and update
                // |noise_global_mean|. Note that |self.noise_means[channel]| is
                // changed after the call.
                noise_global_mean = WeightedAverage(self.noise_means, channel, -tmp2_s16, kNoiseDataWeights)
            }

            // Control that the speech & noise means do not drift too much.
            maxspe = kMaximumSpeech[channel]
            tmp2_s16 = speech_global_mean shr 7
            if (tmp2_s16 > maxspe) {
                // Upper limit of speech model.
                tmp2_s16 -= maxspe
                for (k in 0 until kNumGaussians) {
                    self.speech_means[channel + k * kNumChannels] -= tmp2_s16
                }
            }

            tmp2_s16 = noise_global_mean shr 7
            if (tmp2_s16 > kMaximumNoise[channel]) {
                tmp2_s16 -= kMaximumNoise[channel]
                for (k in 0 until kNumGaussians) {
                    self.noise_means[channel + k * kNumChannels] -= tmp2_s16
                }
            }
        }
        self.frame_counter++
    }

    // Smooth with respect to transition hysteresis.
    if (vadflag == 0) {
        if (self.over_hang > 0) {
            vadflag = 2 + self.over_hang
            self.over_hang--
        }
        self.num_of_speech = 0
    } else {
        self.num_of_speech++
        if (self.num_of_speech > kMaxSpeechFrames) {
            self.num_of_speech = kMaxSpeechFrames
            self.over_hang = self.over_hang_max_2
        } else {
            self.over_hang = self.over_hang_max_1
        }
    }
    return vadflag
}
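// A hedged end-to-end sketch of GmmProbability on synthetic features: feeding
// the model its own initial speech means (converted Q7 -> Q4) should lean the
// decision towards speech. The feature values are assumptions for
// illustration, not real filterbank output.
fun gmmProbabilityExample() {
    val self = VadInstT(Aggressiveness.Quality)
    val features = IntArray(kNumChannels) { ch -> kSpeechDataMeans[ch] shr 3 } // Q7 -> Q4
    val decision = GmmProbability(self, features, total_power = kMinEnergy + 1, frame_length = 80)
    // 0 means noise; any value > 0 (1, or 2 + overhang) means active speech.
    println("VAD decision: $decision")
}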
///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/vad/vad_sp.c

val kSmoothingDown = 6553 // 0.2 in Q15.
val kSmoothingUp = 32439 // 0.99 in Q15.

// Updates and returns the smoothed feature minimum. As minimum we use the
// median of the five smallest feature values in a 100 frames long window.
// As long as |self.frame_counter| is zero, that is, we haven't received any
// "valid" data, FindMinimum() outputs the default value of 1600.
//
// Inputs:
//      - feature_value : New feature value to update with.
//      - channel       : Channel number.
//
// Input & Output:
//      - self          : State information of the VAD.
//
// Returns:
//                      : Smoothed minimum value for a moving window.
//
// Inserts |feature_value| into |low_value_vector|, if it is one of the 16
// smallest values the last 100 frames. Then calculates and returns the median
// of the five smallest values.
fun FindMinimum(self: VadInstT, feature_value: Int, channel: Int): Int {
    assert(channel < kNumChannels)

    // Offset to beginning of the 16 minimum values in memory.
    val offset = channel shl 4
    var position = -1
    var current_median = 1600
    var alpha = 0

    // The 16 minimum values and the age of each value for this |channel| live
    // at |offset| in these arrays (the C original used pointers into them).
    val age = self.index_vector
    val smallest_values = self.low_value_vector

    // Each value in |smallest_values| is getting 1 loop older. Update |age|,
    // and remove old values.
    for (i in 0 until 16) {
        if (age[offset + i] != 100) {
            age[offset + i]++
        } else {
            // Too old value. Remove from memory and shift larger values
            // downwards. (The loop stops at 14 -> 15; reading index 16 as the
            // C original appeared to do would step out of this channel's
            // window, and slot 15 is reset below anyway.)
            for (j in i until 15) {
                smallest_values[offset + j] = smallest_values[offset + j + 1]
                age[offset + j] = age[offset + j + 1]
            }
            age[offset + 15] = 101
            smallest_values[offset + 15] = 10000
        }
    }

    // Check if |feature_value| is smaller than any of the values in
    // |smallest_values|. If so, find the |position| where to insert the new
    // value (|feature_value|). |smallest_values| is sorted ascending, so the
    // unrolled binary search of the C original is replaced by an equivalent
    // lower-bound scan over the 16 values.
    if (feature_value < smallest_values[offset + 15]) {
        position = 0
        while (feature_value >= smallest_values[offset + position]) {
            position++
        }
    }

    // If we have detected a new small value, insert it at the correct position
    // and shift larger values up.
    if (position > -1) {
        for (i in 15 downTo position + 1) {
            smallest_values[offset + i] = smallest_values[offset + i - 1]
            age[offset + i] = age[offset + i - 1]
        }
        smallest_values[offset + position] = feature_value
        age[offset + position] = 1
    }

    // Get |current_median|.
    if (self.frame_counter > 2) {
        current_median = smallest_values[offset + 2]
    } else if (self.frame_counter > 0) {
        current_median = smallest_values[offset]
    }

    // Smooth the median value.
    if (self.frame_counter > 0) {
        alpha = if (current_median < self.mean_value[channel]) {
            kSmoothingDown // 0.2 in Q15.
        } else {
            kSmoothingUp // 0.99 in Q15.
        }
    }
    var tmp32 = (alpha + 1) * self.mean_value[channel]
    tmp32 += (WEBRTC_SPL_WORD16_MAX - alpha) * current_median
    tmp32 += 16384
    self.mean_value[channel] = tmp32 shr 15

    return self.mean_value[channel]
}
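// The smoothing above is mean = alpha * mean + (1 - alpha) * median in Q15,
// with 16384 added for rounding. A small numeric check (values assumed for
// illustration):
fun smoothingExample() {
    val mean = 1600
    val median = 1000 // median < mean, so alpha = kSmoothingDown (0.2 in Q15).
    val alpha = kSmoothingDown
    val smoothed = ((alpha + 1) * mean + (WEBRTC_SPL_WORD16_MAX - alpha) * median + 16384) shr 15
    // Roughly 0.2 * 1600 + 0.8 * 1000 = 1120.
    check(smoothed in 1119..1121)
}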
///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/vad/vad_filterbank.c

// Constants used in LogOfEnergy().
val kLogConst = 24660 // 160*log10(2) in Q9.
val kLogEnergyIntPart = 14336 // 14 in Q10

// Coefficients used by HighPassFilter, Q14.
val kHpZeroCoefs = intArrayOf(6631, -13262, 6631)
val kHpPoleCoefs = intArrayOf(16384, -7756, 5620)

// Allpass filter coefficients, upper and lower, in Q15.
// Upper: 0.64, Lower: 0.17.
val kAllPassCoefsQ15 = intArrayOf(20972, 5571)

// Adjustment for division with two in SplitFilter.
val kOffsetVector = intArrayOf(368, 368, 272, 176, 176, 176)

// High pass filtering, with a cut-off frequency at 80 Hz, if |data_in| is
// sampled at 500 Hz.
//
//      - data_in      [i]   : Input audio data sampled at 500 Hz.
//      - data_length  [i]   : Length of input and output data.
//      - filter_state [i/o] : State of the filter.
//      - data_out     [o]   : Output audio data in the frequency interval
//                             80 - 250 Hz.
fun HighPassFilter(data_in: AudioBuffer, data_length: Int, filter_state: IntArray, data_out: AudioBuffer) {
    // The sum of the absolute values of the impulse response:
    // The zero/pole-filter has a max amplification of a single sample of:
    // 1.4546. Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194
    // The all-zero section has a max amplification of a single sample of:
    // 1.6189. Impulse response: 0.4047 -0.8094 0.4047 0 0 0
    // The all-pole section has a max amplification of a single sample of:
    // 1.9931. Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532
    for (i in 0 until data_length) {
        val sample = data_in[i].toInt()

        // All-zero section (filter coefficients in Q14).
        var tmp32 = kHpZeroCoefs[0] * sample
        tmp32 += kHpZeroCoefs[1] * filter_state[0]
        tmp32 += kHpZeroCoefs[2] * filter_state[1]
        filter_state[1] = filter_state[0]
        filter_state[0] = sample

        // All-pole section (filter coefficients in Q14).
        tmp32 -= kHpPoleCoefs[1] * filter_state[2]
        tmp32 -= kHpPoleCoefs[2] * filter_state[3]
        filter_state[3] = filter_state[2]
        filter_state[2] = (tmp32 shr 14).toShort().toInt() // Truncate to int16 as in the C original.
        data_out[i] = filter_state[2].toShort()
    }
}

// All pass filtering of |data_in|, used before splitting the signal into two
// frequency bands (low pass vs high pass).
// Note that |data_in| and |data_out| can NOT correspond to the same buffer.
// The input is read with a stride of two starting at |in_offset|, which is how
// the C original expressed &data_in[0] / &data_in[1] plus pointer increments
// of 2.
//
//      - data_in            [i]   : Input audio signal given in Q0.
//      - in_offset          [i]   : First input sample to read (0 or 1).
//      - data_length        [i]   : Length of input and output data.
//      - filter_coefficient [i]   : Given in Q15.
//      - filter_state       [i/o] : State of the filter given in Q(-1);
//                                   element |state_index| is used.
//      - data_out           [o]   : Output audio signal given in Q(-1).
fun AllPassFilter(data_in: AudioBuffer, in_offset: Int, data_length: Int,
                  filter_coefficient: Int, filter_state: IntArray, state_index: Int,
                  data_out: AudioBuffer) {
    // The filter can only cause overflow (in the w16 output variable) if more
    // than 4 consecutive input numbers are of maximum value and have the same
    // sign as the impulse response's first taps.
    // First 6 taps of the impulse response:
    // 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990
    var state32 = filter_state[state_index] * (1 shl 16) // Q15
    var in_index = in_offset
    for (i in 0 until data_length) {
        val sample = data_in[in_index].toInt()
        val tmp32 = state32 + filter_coefficient * sample
        val tmp16 = (tmp32 shr 16).toShort().toInt() // Q(-1), truncated to int16 as in C.
        data_out[i] = tmp16.toShort()
        state32 = (sample * (1 shl 14)) - filter_coefficient * tmp16 // Q14
        state32 *= 2 // Q15.
        in_index += 2 // Downsampling by 2.
    }
    filter_state[state_index] = state32 shr 16 // Q(-1)
}

// Splits |data_in| into |hp_data_out| and |lp_data_out| corresponding to
// an upper (high pass) part and a lower (low pass) part respectively.
//
//      - data_in        [i]   : Input audio data to be split into two
//                               frequency bands.
//      - data_length    [i]   : Length of |data_in|.
//      - upper_state    [i/o] : State of the upper filter, given in Q(-1).
//      - lower_state    [i/o] : State of the lower filter, given in Q(-1).
//      - frequency_band [i]   : Index into the state arrays for this band.
//      - hp_data_out    [o]   : Output audio data of the upper half of the
//                               spectrum. The length is |data_length| / 2.
//      - lp_data_out    [o]   : Output audio data of the lower half of the
//                               spectrum. The length is |data_length| / 2.
fun SplitFilter(data_in: AudioBuffer, data_length: Int,
                upper_state: IntArray, lower_state: IntArray, frequency_band: Int,
                hp_data_out: AudioBuffer, lp_data_out: AudioBuffer) {
    val half_length = data_length shr 1 // Downsampling by 2.

    // All-pass filtering upper branch.
    AllPassFilter(data_in, 0, half_length, kAllPassCoefsQ15[0], upper_state, frequency_band, hp_data_out)

    // All-pass filtering lower branch.
    AllPassFilter(data_in, 1, half_length, kAllPassCoefsQ15[1], lower_state, frequency_band, lp_data_out)

    // Make LP and HP signals.
    for (i in 0 until half_length) {
        val tmp_out = hp_data_out[i]
        hp_data_out[i] = (hp_data_out[i] - lp_data_out[i]).toShort()
        lp_data_out[i] = (lp_data_out[i] + tmp_out).toShort()
    }
}
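// A hedged sketch of how SplitFilter halves both bandwidth and sample count:
// an 80-sample, 8 kHz frame becomes two 40-sample bands (0-2000 Hz and
// 2000-4000 Hz). The local state arrays stand in for the VadInstT fields.
fun splitFilterExample() {
    val frame = ShortArray(80) { (it * 100).toShort() } // Arbitrary ramp signal.
    val upperState = IntArray(5)
    val lowerState = IntArray(5)
    val hp = AudioBuffer(40)
    val lp = AudioBuffer(40)
    SplitFilter(frame, 80, upperState, lowerState, 0, hp, lp)
    // |hp| and |lp| now each carry 40 valid samples in Q(-1).
}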
data class LogEnergyResult(
    // 10 * log10("energy of |data_in|") given in Q4.
    val logEnergy: Int,
    // The updated overall energy indicator.
    val totalEnergy: Int
)

// Calculates the energy of |data_in| in dB, and also updates an overall
// |total_energy| if necessary.
//
//      - data_in      [i] : Input audio data for energy calculation.
//      - data_length  [i] : Length of input data.
//      - offset       [i] : Offset value added to |log_energy|.
//      - total_energy [i] : An external energy updated with the energy of
//                           |data_in|.
//                           NOTE: |total_energy| is only updated if
//                           |total_energy| <= |kMinEnergy|.
//
//      - returns          : A LogEnergyResult with the log energy in Q4 and
//                           the updated total energy. (The C original returned
//                           both values through out-pointers.)
fun LogOfEnergy(data_in: AudioBuffer, data_length: Int, offset: Int, total_energy: Int): LogEnergyResult {
    assert(data_length > 0)
    assert(data_length <= data_in.size)

    // Energy() operates on a whole buffer, so hand it the valid prefix.
    val (scale_factor, raw_energy) = Energy(data_in.copyOf(data_length))

    // |tot_rshifts| accumulates the number of right shifts performed on
    // |energy|.
    var tot_rshifts = scale_factor
    // The |energy| will be normalized to 15 bits. We use unsigned integer
    // because we eventually will mask out the fractional part.
    var energy = raw_energy.toUInt()

    if (energy == 0u) {
        return LogEnergyResult(offset, total_energy)
    }

    // By construction, normalizing to 15 bits is equivalent with 17 leading
    // zeros of an unsigned 32 bit value.
    val normalizing_rshifts = 17 - NormU32(energy)
    // In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
    // (14 shl 10), which is what we initialize |log2_energy| with. For a more
    // detailed derivation, see below.
    var log2_energy = kLogEnergyIntPart

    tot_rshifts += normalizing_rshifts
    // Normalize |energy| to 15 bits.
    // |tot_rshifts| is now the total number of right shifts performed on
    // |energy| after normalization. This means that |energy| is in
    // Q(-tot_rshifts).
    energy = if (normalizing_rshifts < 0) {
        energy shl -normalizing_rshifts
    } else {
        energy shr normalizing_rshifts
    }

    // Calculate the energy of |data_in| in dB, in Q4.
    //
    // 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
    // 160 * log10(|energy| * 2^|tot_rshifts|) =
    // 160 * log10(2) * log2(|energy| * 2^|tot_rshifts|) =
    // 160 * log10(2) * (log2(|energy|) + log2(2^|tot_rshifts|)) =
    // (160 * log10(2)) * (log2(|energy|) + |tot_rshifts|) =
    // |kLogConst| * (|log2_energy| + |tot_rshifts|)
    //
    // We know by construction that |energy| is normalized to 15 bits. Hence,
    // |energy| = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
    // Further, we'd like |log2_energy| in Q10
    // log2(|energy|) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
    // 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
    // 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
    // (14 shl 10) + 2^10 * (frac_Q15 * 2^-14) =
    // (14 shl 10) + (frac_Q15 * 2^-4) = (14 shl 10) + (frac_Q15 shr 4)
    //
    // Note that frac_Q15 = (|energy| & 0x00003FFF).

    // Calculate and add the fractional part to |log2_energy|.
    log2_energy += ((energy and 0x00003FFFu) shr 4).toInt()

    // |kLogConst| is in Q9, |log2_energy| in Q10 and |tot_rshifts| in Q0.
    // Note that in our derivation above we have accounted for an output in Q4.
    var log_energy = ((kLogConst * log2_energy) shr 19) + ((tot_rshifts * kLogConst) shr 9)
    if (log_energy < 0) {
        log_energy = 0
    }
    log_energy += offset

    // Update the approximate |total_energy| with the energy of |data_in|, if
    // |total_energy| has not exceeded |kMinEnergy|. |total_energy| is used as
    // an energy indicator in GmmProbability() in vad_core.c.
    var new_total_energy = total_energy
    if (total_energy <= kMinEnergy) {
        new_total_energy += if (tot_rshifts >= 0) {
            // We know by construction that |energy| > |kMinEnergy| in Q0, so
            // add an arbitrary value such that |total_energy| exceeds
            // |kMinEnergy|.
            kMinEnergy + 1
        } else {
            // By construction |energy| is represented by 15 bits, hence any
            // number of right shifted |energy| will fit in an Int. In
            // addition, adding the value to |total_energy| is wrap around safe
            // as long as |kMinEnergy| < 8192.
            (energy shr -tot_rshifts).toInt() // Q0.
        }
    }

    return LogEnergyResult(log_energy, new_total_energy)
}
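// A hedged numeric check of the fixed-point log above against floating point
// (values assumed for illustration): for a constant-amplitude buffer the Q4
// output should track 10 * log10(energy) * 16 within about one dB.
fun logOfEnergyExample() {
    val buffer = ShortArray(60) { 500 }
    val result = LogOfEnergy(buffer, 60, offset = 0, total_energy = 0)
    val trueDb = 10.0 * kotlin.math.log10(60.0 * 500 * 500) // ~71.8 dB.
    val approxDb = result.logEnergy / 16.0 // Q4 -> Q0.
    check(kotlin.math.abs(approxDb - trueDb) < 1.0)
}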
// Takes |data_length| samples of |data_in| and calculates the logarithm of the
// energy of each of the |kNumChannels| = 6 frequency bands used by the VAD:
//        80 Hz - 250 Hz
//        250 Hz - 500 Hz
//        500 Hz - 1000 Hz
//        1000 Hz - 2000 Hz
//        2000 Hz - 3000 Hz
//        3000 Hz - 4000 Hz
//
// The values are given in Q4 and written to |features|. Further, an
// approximate overall energy is returned. The return value is used in
// GmmProbability() as a signal indicator, hence it is arbitrary above the
// threshold |kMinEnergy|.
//
//      - self        [i/o] : State information of the VAD.
//      - data_in     [i]   : Input audio data, for feature extraction.
//      - data_length [i]   : Audio data size, in number of samples.
//      - features    [o]   : 10 * log10(energy in each frequency band), Q4.
//      - returns           : Total energy of the signal (NOTE! This value is
//                            not exact. It is only used in a comparison.)
fun CalculateFeatures(self: VadInstT, data_in: AudioBuffer, data_length: Int, features: IntArray): Int {
    var total_energy = 0
    // We expect |data_length| to be 80, 160 or 240 samples, which corresponds
    // to 10, 20 or 30 ms in 8 kHz. Therefore, the intermediate downsampled
    // data will have at most 120 samples after the first split and at most 60
    // samples after the second split.
    val hp_120 = AudioBuffer(120)
    val lp_120 = AudioBuffer(120)
    val hp_60 = AudioBuffer(60)
    val lp_60 = AudioBuffer(60)
    val half_data_length = data_length shr 1
    // |data_length| / 2, corresponds to bandwidth = 2000 Hz after downsampling.
    var length = half_data_length

    assert(data_length <= 240)
    assert(4 < kNumChannels - 1) // Checking maximum |frequency_band|.

    // Split at 2000 Hz and downsample: |data_in| is [0 - 4000] Hz,
    // |hp_120| becomes [2000 - 4000] Hz and |lp_120| becomes [0 - 2000] Hz.
    var frequency_band = 0
    SplitFilter(data_in, data_length, self.upper_state, self.lower_state, frequency_band, hp_120, lp_120)

    // For the upper band (2000 Hz - 4000 Hz) split at 3000 Hz and downsample:
    // |hp_60| becomes [3000 - 4000] Hz and |lp_60| becomes [2000 - 3000] Hz.
    frequency_band = 1
    SplitFilter(hp_120, length, self.upper_state, self.lower_state, frequency_band, hp_60, lp_60)

    // Energy in 3000 Hz - 4000 Hz.
    length = length shr 1 // |data_length| / 4 <=> bandwidth = 1000 Hz.
    var r = LogOfEnergy(hp_60, length, kOffsetVector[5], total_energy)
    features[5] = r.logEnergy
    total_energy = r.totalEnergy

    // Energy in 2000 Hz - 3000 Hz.
    r = LogOfEnergy(lp_60, length, kOffsetVector[4], total_energy)
    features[4] = r.logEnergy
    total_energy = r.totalEnergy

    // For the lower band (0 Hz - 2000 Hz) split at 1000 Hz and downsample:
    // |hp_60| becomes [1000 - 2000] Hz and |lp_60| becomes [0 - 1000] Hz.
    frequency_band = 2
    length = half_data_length // |data_length| / 2 <=> bandwidth = 2000 Hz.
    SplitFilter(lp_120, length, self.upper_state, self.lower_state, frequency_band, hp_60, lp_60)

    // Energy in 1000 Hz - 2000 Hz.
    length = length shr 1 // |data_length| / 4 <=> bandwidth = 1000 Hz.
    r = LogOfEnergy(hp_60, length, kOffsetVector[3], total_energy)
    features[3] = r.logEnergy
    total_energy = r.totalEnergy

    // For the lower band (0 Hz - 1000 Hz) split at 500 Hz and downsample:
    // |hp_120| becomes [500 - 1000] Hz and |lp_120| becomes [0 - 500] Hz.
    frequency_band = 3
    SplitFilter(lp_60, length, self.upper_state, self.lower_state, frequency_band, hp_120, lp_120)

    // Energy in 500 Hz - 1000 Hz.
    length = length shr 1 // |data_length| / 8 <=> bandwidth = 500 Hz.
    r = LogOfEnergy(hp_120, length, kOffsetVector[2], total_energy)
    features[2] = r.logEnergy
    total_energy = r.totalEnergy

    // For the lower band (0 Hz - 500 Hz) split at 250 Hz and downsample:
    // |hp_60| becomes [250 - 500] Hz and |lp_60| becomes [0 - 250] Hz.
    frequency_band = 4
    SplitFilter(lp_120, length, self.upper_state, self.lower_state, frequency_band, hp_60, lp_60)

    // Energy in 250 Hz - 500 Hz.
    length = length shr 1 // |data_length| / 16 <=> bandwidth = 250 Hz.
    r = LogOfEnergy(hp_60, length, kOffsetVector[1], total_energy)
    features[1] = r.logEnergy
    total_energy = r.totalEnergy

    // Remove 0 Hz - 80 Hz by high pass filtering the lower band.
    HighPassFilter(lp_60, length, self.hp_filter_state, hp_120)

    // Energy in 80 Hz - 250 Hz.
    r = LogOfEnergy(hp_120, length, kOffsetVector[0], total_energy)
    features[0] = r.logEnergy
    total_energy = r.totalEnergy

    return total_energy
}

///////////////////////////////////////////////////////////////////////////////////////////////////////
// webrtc/common_audio/vad/webrtc_vad.c

// This function was moved from vad_core.c.

/****************************************************************************
 * CalcVad8khz(...)
 *
 * Calculates probability for active speech and makes a VAD decision. (The C
 * original also declared CalcVad16khz/32khz/48khz variants, which first
 * downsample to 8 kHz; only the 8 kHz path is ported here.)
 *
 * Input:
 *      - inst          : Initialized VAD instance
 *      - speech_frame  : Input speech frame
 *      - frame_length  : Number of input samples
 *
 * Output:
 *      - inst          : Updated filter states etc.
 *
 * Return value         : VAD decision
 *                        0 - No active speech
 *                        1-6 - Active speech
 */
fun CalcVad8khz(inst: VadInstT, speech_frame: AudioBuffer, frame_length: Int): Int {
    val feature_vector = IntArray(kNumChannels)

    // Get power in the bands.
    val total_power = CalculateFeatures(inst, speech_frame, frame_length, feature_vector)

    // Make a VAD decision.
    inst.vad = GmmProbability(inst, feature_vector, total_power, frame_length)

    return inst.vad
}

// Calculates a VAD decision for |audio_frame|. Only 8000 Hz input is supported
// by this port.
//
//      - handle       [i/o] : VAD instance.
//      - fs           [i]   : Sampling frequency (Hz). Must be 8000.
//      - audio_frame  [i]   : Audio frame buffer.
//      - frame_length [i]   : Length of audio frame buffer in number of
//                             samples.
//
//      - returns            : 1 - (Active Voice),
//                             0 - (Non-active Voice),
//                            -1 - (Error)
fun ProcessVad(handle: WebRtcVadInst, fs: Int, audio_frame: AudioBuffer, frame_length: Int): Int {
    // The NULL-pointer and init-flag checks of the C original are not needed
    // here: a VadInstT is fully initialized by its constructor, and the
    // parameters are non-nullable.
    if (fs != 8000) {
        return -1
    }
    var vad = CalcVad8khz(handle, audio_frame, frame_length)
    if (vad > 0) {
        vad = 1
    }
    return vad
}
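// End-to-end usage sketch (assumed 10 ms frames at 8 kHz; the tone values are
// arbitrary illustration, not a calibrated test signal).
fun main() {
    val vad = VadInstT(Aggressiveness.Quality)
    val frame = ShortArray(80) { i ->
        // A crude 1 kHz square-ish wave with amplitude 2000.
        (if ((i / 4) % 2 == 0) 2000 else -2000).toShort()
    }
    val decision = ProcessVad(vad, 8000, frame, 80)
    println(if (decision == 1) "speech" else if (decision == 0) "silence" else "error")
}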