Transform to idiomatic Kotlin
This commit is contained in:
parent
1ab80332ae
commit
273ba16bf8
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,298 @@
|
||||||
|
package voice_activity_detection
|
||||||
|
|
||||||
|
import AudioBuffer
|
||||||
|
import SampleArray
|
||||||
|
import org.apache.commons.lang3.mutable.MutableInt
|
||||||
|
import kotlin.math.absoluteValue
|
||||||
|
|
||||||
|
/** Minimum energy required to trigger audio signal */
|
||||||
|
const val MIN_ENERGY = 10
|
||||||
|
|
||||||
|
private const val LOG_CONST = 24660 // 160*log10(2) in Q9.
|
||||||
|
private const val LOG_ENERGY_INT_PART = 14336 // 14 in Q10
|
||||||
|
|
||||||
|
private val HP_ZERO_COEFS = intArrayOf(6631, -13262, 6631)
|
||||||
|
private val HP_POLE_COEFS = intArrayOf(16384, -7756, 5620)
|
||||||
|
|
||||||
|
private const val UPPER_ALL_PASS_COEFS_Q15 = 20972 // 0.64
|
||||||
|
private const val LOWER_ALL_PASS_COEFS_Q15 = 5571 // 0.17
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Table used by getLeadingZeroCount.
|
||||||
|
* For each UInt n that's a sequence of 0 bits followed by a sequence of 1 bits, the entry at index
|
||||||
|
* (n * 0x8c0b2891) shr 26 in this table gives the number of zero bits in n.
|
||||||
|
*/
|
||||||
|
private val LEADING_ZEROS_TABLE = intArrayOf(
|
||||||
|
32, 8, 17, -1, -1, 14, -1, -1, -1, 20, -1, -1, -1, 28, -1, 18,
|
||||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 26, 25, 24,
|
||||||
|
4, 11, 23, 31, 3, 7, 10, 16, 22, 30, -1, -1, 2, 6, 13, 9,
|
||||||
|
-1, 15, -1, 21, -1, 29, 19, -1, -1, -1, -1, -1, 1, 27, 5, 12
|
||||||
|
).apply { assert(size == 64) }
|
||||||
|
|
||||||
|
/** Returns the number of leading zero bits in the argument. */
|
||||||
|
fun getLeadingZeroCount(n: UInt): Int {
|
||||||
|
// Normalize n by rounding up to the nearest number that is a sequence of 0 bits followed by a
|
||||||
|
// sequence of 1 bits. This number has the same number of leading zeros as the original n.
|
||||||
|
// There are exactly 33 such values.
|
||||||
|
var normalized = n
|
||||||
|
normalized = normalized or (normalized shr 1)
|
||||||
|
normalized = normalized or (normalized shr 2)
|
||||||
|
normalized = normalized or (normalized shr 4)
|
||||||
|
normalized = normalized or (normalized shr 8)
|
||||||
|
normalized = normalized or (normalized shr 16)
|
||||||
|
|
||||||
|
// Multiply the modified n with a constant selected (by exhaustive search) such that each of the
|
||||||
|
// 33 possible values of n give a product whose 6 most significant bits are unique.
|
||||||
|
// Then look up the answer in the table.
|
||||||
|
return LEADING_ZEROS_TABLE[((normalized * 0x8c0b2891u) shr 26).toInt()]
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of bits by which a signed int can be left-shifted without overflow, or 0 if
|
||||||
|
* a == 0.
|
||||||
|
*/
|
||||||
|
fun normSigned(a: Int): Int =
|
||||||
|
if (a == 0)
|
||||||
|
0
|
||||||
|
else
|
||||||
|
getLeadingZeroCount((if (a < 0) a.inv() else a).toUInt()) - 1
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of bits by which an unsigned int can be left-shifted without overflow, or 0 if
|
||||||
|
* a == 0.
|
||||||
|
*/
|
||||||
|
fun normUnsigned(a: UInt): Int = if (a == 0u) 0 else getLeadingZeroCount(a)
|
||||||
|
|
||||||
|
/** Returns the number of bits needed to represent the specified value. */
|
||||||
|
fun getBitCount(n: UInt): Int = 32 - getLeadingZeroCount(n)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of right bit shifts that must be applied to each of the given samples so that,
|
||||||
|
* if the squares of the samples are added [times] times, the signed 32-bit addition will not
|
||||||
|
* overflow.
|
||||||
|
*/
|
||||||
|
fun getScalingSquare(buffer: AudioBuffer, times: Int): Int {
|
||||||
|
var maxAbsSample = -1
|
||||||
|
for (i in 0 until buffer.size) {
|
||||||
|
val absSample = buffer[i].toInt().absoluteValue
|
||||||
|
if (absSample > maxAbsSample) {
|
||||||
|
maxAbsSample = absSample
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (maxAbsSample == 0) {
|
||||||
|
return 0 // Since norm(0) returns 0
|
||||||
|
}
|
||||||
|
|
||||||
|
val t = normSigned(maxAbsSample * maxAbsSample)
|
||||||
|
val bitCount = getBitCount(times.toUInt())
|
||||||
|
return if (t > bitCount) 0 else bitCount - t
|
||||||
|
}
|
||||||
|
|
||||||
|
data class EnergyResult(
|
||||||
|
/**
|
||||||
|
* The number of left bit shifts needed to get the physical energy value, i.e, to get the Q0
|
||||||
|
* value
|
||||||
|
*/
|
||||||
|
val rightShifts: Int,
|
||||||
|
|
||||||
|
/** The energy value in Q(-[rightShifts]) */
|
||||||
|
val energy: Int
|
||||||
|
)
|
||||||
|
|
||||||
|
/** Calculates the energy of an audio buffer. */
|
||||||
|
fun getEnergy(buffer: AudioBuffer): EnergyResult {
|
||||||
|
val scaling = getScalingSquare(buffer, buffer.size)
|
||||||
|
|
||||||
|
var energy = 0
|
||||||
|
for (i in 0 until buffer.size) {
|
||||||
|
energy += (buffer[i] * buffer[i]) shr scaling
|
||||||
|
}
|
||||||
|
|
||||||
|
return EnergyResult(scaling, energy)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs high pass filtering with a cut-off frequency at 80 Hz, if [input] is sampled at 500 Hz.
|
||||||
|
* @return Output audio data in the frequency interval 80 to 250 Hz.
|
||||||
|
*/
|
||||||
|
fun highPassFilter(input: AudioBuffer, filterState: IntArray): AudioBuffer {
|
||||||
|
// The sum of the absolute values of the impulse response:
|
||||||
|
// The zero/pole-filter has a max amplification of a single sample of: 1.4546
|
||||||
|
// Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194
|
||||||
|
// The all-zero section has a max amplification of a single sample of: 1.6189
|
||||||
|
// Impulse response: 0.4047 -0.8094 0.4047 0 0 0
|
||||||
|
// The all-pole section has a max amplification of a single sample of: 1.9931
|
||||||
|
// Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532
|
||||||
|
|
||||||
|
val result = SampleArray(input.size)
|
||||||
|
for (i in 0 until input.size) {
|
||||||
|
// All-zero section (filter coefficients in Q14).
|
||||||
|
var tmp32 = HP_ZERO_COEFS[0] * input[i]
|
||||||
|
tmp32 += HP_ZERO_COEFS[1] * filterState[0]
|
||||||
|
tmp32 += HP_ZERO_COEFS[2] * filterState[1]
|
||||||
|
filterState[1] = filterState[0]
|
||||||
|
filterState[0] = input[i].toInt()
|
||||||
|
|
||||||
|
// All-pole section (filter coefficients in Q14).
|
||||||
|
tmp32 -= HP_POLE_COEFS[1] * filterState[2]
|
||||||
|
tmp32 -= HP_POLE_COEFS[2] * filterState[3]
|
||||||
|
filterState[3] = filterState[2]
|
||||||
|
filterState[2] = tmp32 shr 14
|
||||||
|
result[i] = filterState[2].toShort()
|
||||||
|
}
|
||||||
|
|
||||||
|
return AudioBuffer(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs all pass filtering, used before splitting the signal into two frequency bands (low pass
|
||||||
|
* vs high pass).
|
||||||
|
* @param[filterCoefficient] Given in Q15.
|
||||||
|
* @param[filterState] State of the filter given in Q(-1).
|
||||||
|
* @return Output audio signal given in Q(-1).
|
||||||
|
*/
|
||||||
|
fun allPassFilter(input: AudioBuffer, filterCoefficient: Int, filterState: MutableInt): AudioBuffer {
|
||||||
|
// The filter can only cause overflow (in the w16 output variable) if more than 4 consecutive
|
||||||
|
// input numbers are of maximum value andhas the the same sign as the impulse responses first
|
||||||
|
// taps.
|
||||||
|
// First 6 taps of the impulse response:
|
||||||
|
// 0.6399 0.5905 -0.3779 0.2418 -0.1547 0.0990
|
||||||
|
|
||||||
|
val result = SampleArray((input.size + 1) / 2)
|
||||||
|
var state32 = filterState.toInt() * (1 shl 16) // Q15
|
||||||
|
for (i in 0 until input.size step 2) {
|
||||||
|
val tmp32 = state32 + filterCoefficient * input[i]
|
||||||
|
val tmp16 = tmp32 shr 16 // Q(-1)
|
||||||
|
result[i / 2] = tmp16.toShort()
|
||||||
|
state32 = input[i] * (1 shl 14) - filterCoefficient * tmp16 // Q14
|
||||||
|
state32 *= 2 // Q15.
|
||||||
|
}
|
||||||
|
filterState.setValue(state32 shr 16) // Q(-1)
|
||||||
|
|
||||||
|
return AudioBuffer(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
data class SplitFilterResult(
|
||||||
|
val highPassData: AudioBuffer,
|
||||||
|
val lowPassData: AudioBuffer
|
||||||
|
)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Splits audio data into an upper (high pass) part and a lower (low pass) part.
|
||||||
|
* @param[upperState] State of the upper filter, given in Q(-1).
|
||||||
|
* @param[lowerState] State of the lower filter, given in Q(-1).
|
||||||
|
*/
|
||||||
|
fun splitFilter(input: AudioBuffer, upperState: MutableInt, lowerState: MutableInt): SplitFilterResult {
|
||||||
|
val resultSize = input.size / 2 // Downsampling by 2
|
||||||
|
|
||||||
|
// All-pass filtering upper branch.
|
||||||
|
val tempHighPass = allPassFilter(input, UPPER_ALL_PASS_COEFS_Q15, upperState)
|
||||||
|
assert(tempHighPass.size == resultSize)
|
||||||
|
|
||||||
|
// All-pass filtering lower branch.
|
||||||
|
val tempLowPass = allPassFilter(AudioBuffer(input, 1), LOWER_ALL_PASS_COEFS_Q15, lowerState)
|
||||||
|
assert(tempLowPass.size == resultSize)
|
||||||
|
|
||||||
|
// Make LP and HP signals.
|
||||||
|
val highPassData = SampleArray(resultSize)
|
||||||
|
val lowPassData = SampleArray(resultSize)
|
||||||
|
for (i in 0 until resultSize) {
|
||||||
|
highPassData[i] = (tempHighPass[i] - tempLowPass[i]).toShort()
|
||||||
|
lowPassData[i] = (tempLowPass[i] + tempHighPass[i]).toShort()
|
||||||
|
}
|
||||||
|
|
||||||
|
return SplitFilterResult(AudioBuffer(highPassData), AudioBuffer(lowPassData))
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the energy of the input signal in dB, and also updates an overall [totalEnergy] if
|
||||||
|
* necessary.
|
||||||
|
* @param[offset] Offset value added to result.
|
||||||
|
* @param[totalEnergy] An external energy updated with the energy of the input signal.
|
||||||
|
* NOTE: [totalEnergy] is only updated if [totalEnergy] <= [MIN_ENERGY].
|
||||||
|
* @return 10 * log10(energy of input signal) given in Q4.
|
||||||
|
*/
|
||||||
|
fun logOfEnergy(input: AudioBuffer, offset: Int, totalEnergy: MutableInt): Int {
|
||||||
|
assert(input.size > 0)
|
||||||
|
|
||||||
|
val energyResult = getEnergy(input)
|
||||||
|
// totalRightShifts accumulates the number of right shifts performed on energy.
|
||||||
|
var totalRightShifts = energyResult.rightShifts
|
||||||
|
// energy will be normalized to 15 bits. We use unsigned integer because we eventually will mask
|
||||||
|
// out the fractional part.
|
||||||
|
var energy = energyResult.energy.toUInt()
|
||||||
|
|
||||||
|
if (energy == 0u) {
|
||||||
|
return offset
|
||||||
|
}
|
||||||
|
|
||||||
|
// By construction, normalizing to 15 bits is equivalent with 17 leading zeros of an unsigned 32
|
||||||
|
// bit value.
|
||||||
|
val normalizingRightShifts = 17 - normUnsigned(energy)
|
||||||
|
// In a 15 bit representation the leading bit is 2^14. log2(2^14) in Q10 is
|
||||||
|
// (14 shl 10), which is what we initialize log2Energy with. For a more detailed derivations,
|
||||||
|
// see below.
|
||||||
|
var log2Energy = LOG_ENERGY_INT_PART
|
||||||
|
|
||||||
|
totalRightShifts += normalizingRightShifts
|
||||||
|
// Normalize energy to 15 bits.
|
||||||
|
// totalRightShifts is now the total number of right shifts performed on energy after
|
||||||
|
// normalization. This means that energy is in Q(-totalRightShifts).
|
||||||
|
energy = if (normalizingRightShifts < 0)
|
||||||
|
energy shl -normalizingRightShifts
|
||||||
|
else
|
||||||
|
energy shr normalizingRightShifts
|
||||||
|
|
||||||
|
// Calculate the energy ofinput in dB, in Q4.
|
||||||
|
//
|
||||||
|
// 10 * log10("true energy") in Q4 = 2^4 * 10 * log10("true energy") =
|
||||||
|
// 160 * log10(energy * 2^totalRightShifts) =
|
||||||
|
// 160 * log10(2) * log2(energy * 2^totalRightShifts) =
|
||||||
|
// 160 * log10(2) * (log2(energy) + log2(2^totalRightShifts)) =
|
||||||
|
// (160 * log10(2)) * (log2(energy) + totalRightShifts) =
|
||||||
|
// LOG_CONST * (log2_energy + totalRightShifts)
|
||||||
|
//
|
||||||
|
// We know by construction that energy is normalized to 15 bits.
|
||||||
|
// Hence, energy = 2^14 + frac_Q15, where frac_Q15 is a fractional part in Q15.
|
||||||
|
// Further, we'd like log2_energy in Q10
|
||||||
|
// log2(energy) in Q10 = 2^10 * log2(2^14 + frac_Q15) =
|
||||||
|
// 2^10 * log2(2^14 * (1 + frac_Q15 * 2^-14)) =
|
||||||
|
// 2^10 * (14 + log2(1 + frac_Q15 * 2^-14)) ~=
|
||||||
|
// (14 shl 10) + 2^10 * (frac_Q15 * 2^-14) =
|
||||||
|
// (14 shl 10) + (frac_Q15 * 2^-4) = (14 shl 10) + (frac_Q15 shr 4)
|
||||||
|
//
|
||||||
|
// Note that frac_Q15 = (energy & 0x00003FFF)
|
||||||
|
|
||||||
|
// Calculate and add the fractional part to log2Energy.
|
||||||
|
log2Energy += ((energy and 0x00003FFFu) shr 4).toInt()
|
||||||
|
|
||||||
|
// LOG_CONST is in Q9, log2_energy in Q10 and totalRightShifts in Q0.
|
||||||
|
// Note that in our derivation above, we have accounted for an output in Q4.
|
||||||
|
var logEnergy = ((LOG_CONST * log2Energy) shr 19) + ((totalRightShifts * LOG_CONST) shr 9)
|
||||||
|
|
||||||
|
if (logEnergy < 0) {
|
||||||
|
logEnergy = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
logEnergy += offset
|
||||||
|
|
||||||
|
// Update the approximate totalEnergy with the energy of input, if totalEnergy has not exceeded
|
||||||
|
// MIN_ENERGY.
|
||||||
|
// totalEnergy is used as an energy indicator in getGmmProbability().
|
||||||
|
if (totalEnergy.toInt() <= MIN_ENERGY) {
|
||||||
|
if (totalRightShifts >= 0) {
|
||||||
|
// We know by construction that energy > MIN_ENERGY in Q0, so add an arbitrary value
|
||||||
|
// such that totalEnergy exceeds MIN_ENERGY.
|
||||||
|
totalEnergy.add(MIN_ENERGY + 1)
|
||||||
|
} else {
|
||||||
|
// By construction, energy is represented by 15 bits, hence any number of right shifted
|
||||||
|
// energy will fit in an Int.
|
||||||
|
// In addition, adding the value to totalEnergy is wrap around safe as long as
|
||||||
|
// MIN_ENERGY < 8192.
|
||||||
|
totalEnergy.add((energy shr -totalRightShifts).toInt()) // Q0.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return logEnergy
|
||||||
|
}
|
|
@ -0,0 +1,734 @@
|
||||||
|
package voice_activity_detection
|
||||||
|
|
||||||
|
import AudioBuffer
|
||||||
|
import org.apache.commons.lang3.mutable.MutableInt
|
||||||
|
|
||||||
|
private const val COMP_VAR = 22005
|
||||||
|
private const val LOG2EXP = 5909 // log2(exp(1)) in Q12
|
||||||
|
|
||||||
|
private val SPECTRUM_WEIGHT = intArrayOf(6, 8, 10, 12, 14, 16)
|
||||||
|
.apply { assert(size == CHANNEL_COUNT) }
|
||||||
|
|
||||||
|
private const val NOISE_UPDATE_CONST: Int = 655 // Q15
|
||||||
|
|
||||||
|
private const val SPEECH_UPDATE_CONST: Int = 6554 // Q15
|
||||||
|
|
||||||
|
private const val BACK_ETA: Int = 154 // Q8
|
||||||
|
|
||||||
|
/** Upper limit of mean value for speech model, Q7 */
|
||||||
|
private val MAXIMUM_SPEECH = intArrayOf(11392, 11392, 11520, 11520, 11520, 11520)
|
||||||
|
.apply { assert(size == CHANNEL_COUNT) }
|
||||||
|
|
||||||
|
/** Minimum difference between the two models, Q5 */
|
||||||
|
private val MINIMUM_DIFFERENCE = intArrayOf(544, 544, 576, 576, 576, 576)
|
||||||
|
.apply { assert(size == CHANNEL_COUNT) }
|
||||||
|
|
||||||
|
/** Minimum value for mean value */
|
||||||
|
private val MINIMUM_MEAN = intArrayOf(640, 768)
|
||||||
|
.apply { assert(size == GAUSSIAN_COUNT) }
|
||||||
|
|
||||||
|
/** Upper limit of mean value for noise model, Q7 */
|
||||||
|
private val MAXIMUM_NOISE = intArrayOf(9216, 9088, 8960, 8832, 8704, 8576)
|
||||||
|
.apply { assert(size == CHANNEL_COUNT) }
|
||||||
|
|
||||||
|
/** Weights for the two Gaussians for the six channels (noise) */
|
||||||
|
private val NOISE_DATA_WEIGHTS = intArrayOf(34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103)
|
||||||
|
.apply { assert(size == TABLE_SIZE) }
|
||||||
|
|
||||||
|
/** Weights for the two Gaussians for the six channels (speech) */
|
||||||
|
private val SPEECH_DATA_WEIGHTS = intArrayOf(48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81)
|
||||||
|
.apply { assert(size == TABLE_SIZE) }
|
||||||
|
|
||||||
|
/** Means for the two Gaussians for the six channels (noise) */
|
||||||
|
private val NOISE_DATA_MEANS = intArrayOf(6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362)
|
||||||
|
.apply { assert(size == TABLE_SIZE) }
|
||||||
|
|
||||||
|
/** Means for the two Gaussians for the six channels (speech) */
|
||||||
|
private val SPEECH_DATA_MEANS = intArrayOf(8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483)
|
||||||
|
.apply { assert(size == TABLE_SIZE) }
|
||||||
|
|
||||||
|
/** Stds for the two Gaussians for the six channels (noise) */
|
||||||
|
private val NOISE_DATA_STDS = intArrayOf(378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455)
|
||||||
|
.apply { assert(size == TABLE_SIZE) }
|
||||||
|
|
||||||
|
/** Stds for the two Gaussians for the six channels (speech) */
|
||||||
|
private val SPEECH_DATA_STDS = intArrayOf(555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850)
|
||||||
|
.apply { assert(size == TABLE_SIZE) }
|
||||||
|
|
||||||
|
/** Maximum number of counted speech (VAD = 1) frames in a row */
|
||||||
|
private const val MAX_SPEECH_FRAMES = 6
|
||||||
|
|
||||||
|
/** Minimum standard deviation for both speech and noise */
|
||||||
|
private const val MIN_STD = 384
|
||||||
|
|
||||||
|
/** Number of frequency bands (named channels) */
|
||||||
|
private const val CHANNEL_COUNT = 6
|
||||||
|
|
||||||
|
/** Number of Gaussians per channel in the GMM */
|
||||||
|
private const val GAUSSIAN_COUNT = 2
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Size of a table containing one value per channel and Gaussian.
|
||||||
|
* Indexed as gaussian * CHANNEL_COUNT + channel.
|
||||||
|
*/
|
||||||
|
private const val TABLE_SIZE = CHANNEL_COUNT * GAUSSIAN_COUNT
|
||||||
|
|
||||||
|
// Adjustment for division with two in splitFilter.
|
||||||
|
private val SPLIT_FILTER_OFFSETS = intArrayOf(368, 368, 272, 176, 176, 176)
|
||||||
|
|
||||||
|
private const val SMOOTHING_DOWN = 6553 // 0.2 in Q15.
|
||||||
|
private const val SMOOTHING_UP = 32439 // 0.99 in Q15.
|
||||||
|
|
||||||
|
/** Performs a safe integer division, returning [Int.MAX_VALUE] if [denominator] = 0. */
|
||||||
|
private infix fun Int.safeDiv(denominator: Int) = if (denominator != 0) this / denominator else Int.MAX_VALUE
|
||||||
|
|
||||||
|
private data class GaussianProbabilityResult(
|
||||||
|
/** (probability for input) = 1 / std * exp(-(input - mean)^2 / (2 * std^2)) */
|
||||||
|
val probability: Int,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Input used when updating the model, Q11.
|
||||||
|
* delta = (input - mean) / std^2.
|
||||||
|
*/
|
||||||
|
val delta: Int
|
||||||
|
)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the probability for [input], given that [input] comes from a normal distribution with
|
||||||
|
* mean [mean] and standard deviation [std].
|
||||||
|
*
|
||||||
|
* @param [input] Input sample in Q4.
|
||||||
|
* @param [mean] Mean input in the statistical model, Q7.
|
||||||
|
* @param [std] Standard deviation, Q7.
|
||||||
|
*/
|
||||||
|
private fun getGaussianProbability(input: Int, mean: Int, std: Int): GaussianProbabilityResult {
|
||||||
|
var tmp16: Int
|
||||||
|
var expValue = 0
|
||||||
|
|
||||||
|
// Calculate invStd = 1 / s, in Q10.
|
||||||
|
// 131072 = 1 in Q17, and (std shr 1) is for rounding instead of truncation.
|
||||||
|
// Q-domain: Q17 / Q7 = Q10
|
||||||
|
var tmp32 = 131072 + (std shr 1)
|
||||||
|
val invStd = tmp32 safeDiv std
|
||||||
|
|
||||||
|
// Calculate inv_std2 = 1 / s^2, in Q14
|
||||||
|
tmp16 = invStd shr 2 // Q10 -> Q8.
|
||||||
|
// Q-domain: (Q8 * Q8) shr 2 = Q14
|
||||||
|
val invStd2 = (tmp16 * tmp16) shr 2
|
||||||
|
|
||||||
|
tmp16 = input shl 3 // Q4 -> Q7
|
||||||
|
tmp16 -= mean // Q7 - Q7 = Q7
|
||||||
|
|
||||||
|
// To be used later, when updating noise/speech model.
|
||||||
|
// delta = (x - m) / s^2, in Q11.
|
||||||
|
// Q-domain: (Q14 * Q7) shr 10 = Q11
|
||||||
|
val delta = (invStd2 * tmp16) shr 10
|
||||||
|
|
||||||
|
// Calculate the exponent tmp32 = (x - m)^2 / (2 * s^2), in Q10.
|
||||||
|
// Replacing division by two with one shift.
|
||||||
|
// Q-domain: (Q11 * Q7) shr 8 = Q10.
|
||||||
|
tmp32 = (delta * tmp16) shr 9
|
||||||
|
|
||||||
|
// If the exponent is small enough to give a non-zero probability, we calculate
|
||||||
|
// exp_value ~= exp(-(x - m)^2 / (2 * s^2))
|
||||||
|
// ~= exp2(-log2(exp(1)) * tmp32)
|
||||||
|
if (tmp32 < COMP_VAR) {
|
||||||
|
// Calculate tmp16 = log2(exp(1)) * tmp32, in Q10.
|
||||||
|
// Q-domain: (Q12 * Q10) shr 12 = Q10.
|
||||||
|
tmp16 = (LOG2EXP * tmp32) shr 12
|
||||||
|
tmp16 = -tmp16
|
||||||
|
expValue = 0x0400 or (tmp16 and 0x03FF)
|
||||||
|
tmp16 = tmp16 xor 0xFFFF
|
||||||
|
tmp16 = tmp16 shr 10
|
||||||
|
tmp16 += 1
|
||||||
|
// Get expValue = exp(-tmp32) in Q10.
|
||||||
|
expValue = expValue shr tmp16
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate and return (1 / s) * exp(-(x - m)^2 / (2 * s^2)), in Q20.
|
||||||
|
// Q-domain: Q10 * Q10 = Q20.
|
||||||
|
val probability = invStd * expValue
|
||||||
|
return GaussianProbabilityResult(probability, delta)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the weighted average with regard to number of Gaussians.
|
||||||
|
* CAUTION: Modifies [data] by adding the specified offset to each element.
|
||||||
|
*
|
||||||
|
* @param[data] Data to average.
|
||||||
|
* @param[offset] An offset to add to each element of [data].
|
||||||
|
* @param[weights] Weights used for averaging.
|
||||||
|
* @return The weighted average.
|
||||||
|
*/
|
||||||
|
private fun getWeightedAverage(data: IntArray, channel: Int, offset: Int, weights: IntArray): Int {
|
||||||
|
var result = 0
|
||||||
|
for (k in 0 until GAUSSIAN_COUNT) {
|
||||||
|
val index = k * CHANNEL_COUNT + channel
|
||||||
|
data[index] += offset
|
||||||
|
result += data[index] * weights[index]
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The VAD operating modes in order of increasing aggressiveness.
|
||||||
|
* A more aggressive VAD is more restrictive in reporting speech. Put in other words, the
|
||||||
|
* probability of being speech when the VAD returns 1 is increased with increasing mode. As a
|
||||||
|
* consequence, the missed detection rate also goes up.
|
||||||
|
*/
|
||||||
|
enum class Aggressiveness {
|
||||||
|
Quality,
|
||||||
|
LowBitrate,
|
||||||
|
Aggressive,
|
||||||
|
VeryAggressive
|
||||||
|
}
|
||||||
|
|
||||||
|
class VoiceActivityDetector(aggressiveness: Aggressiveness = Aggressiveness.Quality) {
|
||||||
|
private var frameCount: Int = 0
|
||||||
|
private var overhang: Int = 0
|
||||||
|
private var speechFrameCount: Int = 0
|
||||||
|
|
||||||
|
// PDF parameters
|
||||||
|
private val noiseMeans = NOISE_DATA_MEANS.clone()
|
||||||
|
private val speechMeans = SPEECH_DATA_MEANS.clone()
|
||||||
|
private val noiseStds = NOISE_DATA_STDS.clone()
|
||||||
|
private val speechStds = SPEECH_DATA_STDS.clone()
|
||||||
|
|
||||||
|
private val ageVector = IntArray(16 * CHANNEL_COUNT) { 0 }
|
||||||
|
private val minimumValueVector = IntArray(16 * CHANNEL_COUNT) { 10000 }
|
||||||
|
|
||||||
|
// Splitting filter states
|
||||||
|
private val upperState = List(5) { MutableInt(0) }
|
||||||
|
private val lowerState = List(5) { MutableInt(0) }
|
||||||
|
|
||||||
|
// High pass filter states
|
||||||
|
private val highPassFilterState = IntArray(4) { 0 }
|
||||||
|
|
||||||
|
// Median value memory for findMinimum()
|
||||||
|
private val median = IntArray(CHANNEL_COUNT) { 1600 }
|
||||||
|
|
||||||
|
private data class ThresholdsRecord(
|
||||||
|
val overhangMax1: Int,
|
||||||
|
val overhangMax2: Int,
|
||||||
|
val localThreshold: Int,
|
||||||
|
val globalThreshold: Int
|
||||||
|
)
|
||||||
|
|
||||||
|
private val thresholds = when(aggressiveness) {
|
||||||
|
Aggressiveness.Quality -> ThresholdsRecord(8, 14, 24, 57)
|
||||||
|
Aggressiveness.LowBitrate -> ThresholdsRecord(8, 14, 37, 100)
|
||||||
|
Aggressiveness.Aggressive -> ThresholdsRecord(6, 9, 82, 285)
|
||||||
|
Aggressiveness.VeryAggressive -> ThresholdsRecord(6, 9, 94, 1100)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates the probabilities for both speech and background noise using Gaussian Mixture
|
||||||
|
* Models (GMM). A hypothesis-test is performed to decide which type of signal is most probable.
|
||||||
|
*
|
||||||
|
* @param[features] Feature vector of length [CHANNEL_COUNT] = log10(energy in frequency band)
|
||||||
|
* @param[totalEnergy] Total energy in audio frame.
|
||||||
|
* @param[frameLength] Number of input samples.
|
||||||
|
* @return VAD decision. 0: no active speech; 1-6: active speech
|
||||||
|
*/
|
||||||
|
private fun getGmmProbability(features: List<Int>, totalEnergy: Int, frameLength: Int): Int {
|
||||||
|
var vadFlag = 0
|
||||||
|
var tmp: Int
|
||||||
|
var tmp1: Int
|
||||||
|
var tmp2: Int
|
||||||
|
val deltaN = IntArray(TABLE_SIZE)
|
||||||
|
val deltaS = IntArray(TABLE_SIZE)
|
||||||
|
val ngprvec = IntArray(TABLE_SIZE) { 0 } // Conditional probability = 0.
|
||||||
|
val sgprvec = IntArray(TABLE_SIZE) { 0 } // Conditional probability = 0.
|
||||||
|
var sumLogLikelihoodRatios = 0
|
||||||
|
val noiseProbability = IntArray(GAUSSIAN_COUNT)
|
||||||
|
val speechProbability = IntArray(GAUSSIAN_COUNT)
|
||||||
|
|
||||||
|
assert(frameLength == 80)
|
||||||
|
|
||||||
|
if (totalEnergy > MIN_ENERGY) {
|
||||||
|
// The signal power of current frame is large enough for processing. The processing
|
||||||
|
// consists of two parts:
|
||||||
|
// 1) Calculating the likelihood of speech and thereby a VAD decision.
|
||||||
|
// 2) Updating the underlying model, w.r.t., the decision made.
|
||||||
|
|
||||||
|
// The detection scheme is an LRT with hypothesis
|
||||||
|
// H0: Noise
|
||||||
|
// H1: Speech
|
||||||
|
//
|
||||||
|
// We combine a global LRT with local tests, for each frequency sub-band, here named
|
||||||
|
// channel.
|
||||||
|
for (channel in 0 until CHANNEL_COUNT) {
|
||||||
|
// For each channel we model the probability with a GMM consisting of
|
||||||
|
// GAUSSIAN_COUNT, with different means and standard deviations depending
|
||||||
|
// on H0 or H1.
|
||||||
|
var h0Test = 0
|
||||||
|
var h1Test = 0
|
||||||
|
for (k in 0 until GAUSSIAN_COUNT) {
|
||||||
|
val gaussian = channel + k * CHANNEL_COUNT
|
||||||
|
|
||||||
|
// Probability under H0, that is, probability of frame being noise.
|
||||||
|
// Value given in Q27 = Q7 * Q20.
|
||||||
|
val pNoise = getGaussianProbability(features[channel], noiseMeans[gaussian], noiseStds[gaussian])
|
||||||
|
deltaN[gaussian] = pNoise.delta
|
||||||
|
noiseProbability[k] = NOISE_DATA_WEIGHTS[gaussian] * pNoise.probability
|
||||||
|
h0Test += noiseProbability[k] // Q27
|
||||||
|
|
||||||
|
// Probability under H1, that is, probability of frame being speech.
|
||||||
|
// Value given in Q27 = Q7 * Q20.
|
||||||
|
val pSpeech = getGaussianProbability(features[channel], speechMeans[gaussian], speechStds[gaussian])
|
||||||
|
speechProbability[k] = SPEECH_DATA_WEIGHTS[gaussian] * pSpeech.probability
|
||||||
|
deltaS[gaussian] = pSpeech.delta
|
||||||
|
h1Test += speechProbability[k] // Q27
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H1}).
|
||||||
|
// Approximation:
|
||||||
|
// log2(Pr{X|H1} / Pr{X|H1}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H1}*2^Q)
|
||||||
|
// = log2(h1_test) - log2(h0_test)
|
||||||
|
// = log2(2^(31-shifts_h1)*(1+b1))
|
||||||
|
// - log2(2^(31-shifts_h0)*(1+b0))
|
||||||
|
// = shifts_h0 - shifts_h1
|
||||||
|
// + log2(1+b1) - log2(1+b0)
|
||||||
|
// ~= shifts_h0 - shifts_h1
|
||||||
|
//
|
||||||
|
// Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
|
||||||
|
// Further, b0 and b1 are independent and on the average the two terms cancel.
|
||||||
|
val shiftsH0 = if (h0Test != 0) normSigned(h0Test) else 31
|
||||||
|
val shiftsH1 = if (h1Test != 0) normSigned(h1Test) else 31
|
||||||
|
val logLikelihoodRatio = shiftsH0 - shiftsH1
|
||||||
|
|
||||||
|
// Update sumLogLikelihoodRatios with spectrum weighting.
|
||||||
|
// This is used for the global VAD decision.
|
||||||
|
sumLogLikelihoodRatios += logLikelihoodRatio * SPECTRUM_WEIGHT[channel]
|
||||||
|
|
||||||
|
// Local VAD decision.
|
||||||
|
if ((logLikelihoodRatio * 4) > thresholds.localThreshold) {
|
||||||
|
vadFlag = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate local noise probabilities used later when updating the GMM.
|
||||||
|
val h0 = h0Test shr 12 // Q15
|
||||||
|
if (h0 > 0) {
|
||||||
|
// High probability of noise. Assign conditional probabilities for each
|
||||||
|
// Gaussian in the GMM.
|
||||||
|
val tmp3 = (noiseProbability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
|
||||||
|
ngprvec[channel] = tmp3 safeDiv h0 // Q14
|
||||||
|
ngprvec[channel + CHANNEL_COUNT] = 16384 - ngprvec[channel]
|
||||||
|
} else {
|
||||||
|
// Low noise probability. Assign conditional probability 1 to the first
|
||||||
|
// Gaussian and 0 to the rest (which is already set at initialization).
|
||||||
|
ngprvec[channel] = 16384
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate local speech probabilities used later when updating the GMM.
|
||||||
|
val h1 = h1Test shr 12 // Q15
|
||||||
|
if (h1 > 0) {
|
||||||
|
// High probability of speech. Assign conditional probabilities for each
|
||||||
|
// Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
|
||||||
|
val tmp3 = (speechProbability[0] and 0xFFFFF000u.toInt()) shl 2 // Q29
|
||||||
|
sgprvec[channel] = tmp3 safeDiv h1 // Q14
|
||||||
|
sgprvec[channel + CHANNEL_COUNT] = 16384 - sgprvec[channel]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make a global VAD decision.
|
||||||
|
vadFlag = vadFlag or (if (sumLogLikelihoodRatios >= thresholds.globalThreshold) 1 else 0)
|
||||||
|
|
||||||
|
// Update the model parameters.
|
||||||
|
var maxspe = 12800
|
||||||
|
for (channel in 0 until CHANNEL_COUNT) {
|
||||||
|
// Get minimum value in past which is used for long term correction in Q4.
|
||||||
|
val featureMinimum = findMinimum(features[channel], channel)
|
||||||
|
|
||||||
|
// Compute the "global" mean, that is the sum of the two means weighted.
|
||||||
|
var noiseGlobalMean = getWeightedAverage(noiseMeans, channel, 0, NOISE_DATA_WEIGHTS)
|
||||||
|
tmp1 = noiseGlobalMean shr 6 // Q8
|
||||||
|
|
||||||
|
for (k in 0 until GAUSSIAN_COUNT) {
|
||||||
|
val gaussian = channel + k * CHANNEL_COUNT
|
||||||
|
|
||||||
|
val nmk = noiseMeans[gaussian]
|
||||||
|
val smk = speechMeans[gaussian]
|
||||||
|
var nsk = noiseStds[gaussian]
|
||||||
|
var ssk = speechStds[gaussian]
|
||||||
|
|
||||||
|
// Update noise mean vector if the frame consists of noise only.
|
||||||
|
var nmk2 = nmk
|
||||||
|
if (vadFlag == 0) {
|
||||||
|
// deltaN = (x-mu)/sigma^2
|
||||||
|
// ngprvec[k] = noiseProbability[k] / (noiseProbability[0] + noiseProbability[1])
|
||||||
|
|
||||||
|
// (Q14 * Q11 shr 11) = Q14.
|
||||||
|
val delt = (ngprvec[gaussian] * deltaN[gaussian]) shr 11
|
||||||
|
// Q7 + (Q14 * Q15 shr 22) = Q7.
|
||||||
|
nmk2 = nmk + ((delt * NOISE_UPDATE_CONST) shr 22)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Long term correction of the noise mean.
|
||||||
|
// Q8 - Q8 = Q8.
|
||||||
|
val ndelt = (featureMinimum shl 4) - tmp1
|
||||||
|
// Q7 + (Q8 * Q8) shr 9 = Q7.
|
||||||
|
var nmk3 = nmk2 + ((ndelt * BACK_ETA) shr 9)
|
||||||
|
|
||||||
|
// Control that the noise mean does not drift to much.
|
||||||
|
tmp = (k + 5) shl 7
|
||||||
|
if (nmk3 < tmp) {
|
||||||
|
nmk3 = tmp
|
||||||
|
}
|
||||||
|
tmp = (72 + k - channel) shl 7
|
||||||
|
if (nmk3 > tmp) {
|
||||||
|
nmk3 = tmp
|
||||||
|
}
|
||||||
|
noiseMeans[gaussian] = nmk3
|
||||||
|
|
||||||
|
if (vadFlag != 0) {
|
||||||
|
// Update speech mean vector:
|
||||||
|
// deltaS = (x-mu)/sigma^2
|
||||||
|
// sgprvec[k] = speechProbability[k] / (speechProbability[0] + speechProbability[1])
|
||||||
|
|
||||||
|
// (Q14 * Q11) shr 11 = Q14.
|
||||||
|
val delt = (sgprvec[gaussian] * deltaS[gaussian]) shr 11
|
||||||
|
// Q14 * Q15 shr 21 = Q8.
|
||||||
|
tmp = (delt * SPEECH_UPDATE_CONST) shr 21
|
||||||
|
// Q7 + (Q8 shr 1) = Q7. With rounding.
|
||||||
|
var smk2 = smk + ((tmp + 1) shr 1)
|
||||||
|
|
||||||
|
// Control that the speech mean does not drift to much.
|
||||||
|
val maxmu = maxspe + 640
|
||||||
|
if (smk2 < MINIMUM_MEAN[k]) {
|
||||||
|
smk2 = MINIMUM_MEAN[k]
|
||||||
|
}
|
||||||
|
if (smk2 > maxmu) {
|
||||||
|
smk2 = maxmu
|
||||||
|
}
|
||||||
|
speechMeans[gaussian] = smk2 // Q7.
|
||||||
|
|
||||||
|
// (Q7 shr 3) = Q4. With rounding.
|
||||||
|
tmp = (smk + 4) shr 3
|
||||||
|
|
||||||
|
tmp = features[channel] - tmp // Q4
|
||||||
|
// (Q11 * Q4 shr 3) = Q12.
|
||||||
|
var tmp4 = (deltaS[gaussian] * tmp) shr 3
|
||||||
|
var tmp5 = tmp4 - 4096
|
||||||
|
tmp = sgprvec[gaussian] shr 2
|
||||||
|
// (Q14 shr 2) * Q12 = Q24.
|
||||||
|
tmp4 = tmp * tmp5
|
||||||
|
|
||||||
|
tmp5 = tmp4 shr 4 // Q20
|
||||||
|
|
||||||
|
// 0.1 * Q20 / Q7 = Q13.
|
||||||
|
if (tmp5 > 0) {
|
||||||
|
tmp = tmp5 safeDiv (ssk * 10)
|
||||||
|
} else {
|
||||||
|
tmp = -tmp5 safeDiv (ssk * 10)
|
||||||
|
tmp = -tmp
|
||||||
|
}
|
||||||
|
// Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
|
||||||
|
// Note that division by 4 equals shift by 2, hence,
|
||||||
|
// (Q13 shr 8) = (Q13 shr 6) / 4 = Q7.
|
||||||
|
tmp += 128 // Rounding.
|
||||||
|
ssk += (tmp shr 8)
|
||||||
|
if (ssk < MIN_STD) {
|
||||||
|
ssk = MIN_STD
|
||||||
|
}
|
||||||
|
speechStds[gaussian] = ssk
|
||||||
|
} else {
|
||||||
|
// Update GMM variance vectors.
|
||||||
|
// deltaN * (features[channel] - nmk) - 1
|
||||||
|
// Q4 - (Q7 shr 3) = Q4.
|
||||||
|
tmp = features[channel] - (nmk shr 3)
|
||||||
|
// (Q11 * Q4 shr 3) = Q12.
|
||||||
|
var tmp5 = (deltaN[gaussian] * tmp) shr 3
|
||||||
|
tmp5 -= 4096
|
||||||
|
|
||||||
|
// (Q14 shr 2) * Q12 = Q24.
|
||||||
|
tmp = (ngprvec[gaussian] + 2) shr 2
|
||||||
|
val tmp4 = tmp * tmp5
|
||||||
|
// Q20 * approx 0.001 (2^-10=0.0009766), hence,
|
||||||
|
// (Q24 shr 14) = (Q24 shr 4) / 2^10 = Q20.
|
||||||
|
tmp5 = tmp4 shr 14
|
||||||
|
|
||||||
|
// Q20 / Q7 = Q13.
|
||||||
|
if (tmp5 > 0) {
|
||||||
|
tmp = tmp5 safeDiv nsk
|
||||||
|
} else {
|
||||||
|
tmp = -tmp5 safeDiv nsk
|
||||||
|
tmp = -tmp
|
||||||
|
}
|
||||||
|
tmp += 32 // Rounding
|
||||||
|
nsk += tmp shr 6 // Q13 shr 6 = Q7.
|
||||||
|
if (nsk < MIN_STD) {
|
||||||
|
nsk = MIN_STD
|
||||||
|
}
|
||||||
|
noiseStds[gaussian] = nsk
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Separate models if they are too close.
|
||||||
|
// noiseGlobalMean in Q14 (= Q7 * Q7).
|
||||||
|
noiseGlobalMean = getWeightedAverage(noiseMeans, channel, 0, NOISE_DATA_WEIGHTS)
|
||||||
|
|
||||||
|
// speechGlobalMean in Q14 (= Q7 * Q7).
|
||||||
|
var speechGlobalMean = getWeightedAverage(speechMeans, channel, 0, SPEECH_DATA_WEIGHTS)
|
||||||
|
|
||||||
|
// diff = "global" speech mean - "global" noise mean.
|
||||||
|
// (Q14 shr 9) - (Q14 shr 9) = Q5.
|
||||||
|
val diff = (speechGlobalMean shr 9) - (noiseGlobalMean shr 9)
|
||||||
|
if (diff < MINIMUM_DIFFERENCE[channel]) {
|
||||||
|
tmp = MINIMUM_DIFFERENCE[channel] - diff
|
||||||
|
|
||||||
|
// tmp1_s16 = ~0.8 * (MINIMUM_DIFFERENCE - diff) in Q7.
|
||||||
|
// tmp2_s16 = ~0.2 * (MINIMUM_DIFFERENCE - diff) in Q7.
|
||||||
|
tmp1 = (13 * tmp) shr 2
|
||||||
|
tmp2 = (3 * tmp) shr 2
|
||||||
|
|
||||||
|
// Move Gaussian means for speech model by tmp1 and update speechGlobalMean.
|
||||||
|
// Note that speechMeans[channel] is changed after the call.
|
||||||
|
speechGlobalMean = getWeightedAverage(speechMeans, channel, tmp1, SPEECH_DATA_WEIGHTS)
|
||||||
|
|
||||||
|
// Move Gaussian means for noise model by -tmp2 and update noiseGlobalMean.
|
||||||
|
// Note that noiseMeans[channel] is
|
||||||
|
// changed after the call.
|
||||||
|
noiseGlobalMean = getWeightedAverage(noiseMeans, channel, -tmp2, NOISE_DATA_WEIGHTS)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Control that the speech & noise means do not drift to much.
|
||||||
|
maxspe = MAXIMUM_SPEECH[channel]
|
||||||
|
tmp2 = speechGlobalMean shr 7
|
||||||
|
if (tmp2 > maxspe) {
|
||||||
|
// Upper limit of speech model.
|
||||||
|
tmp2 -= maxspe
|
||||||
|
|
||||||
|
for (k in 0 until GAUSSIAN_COUNT) {
|
||||||
|
speechMeans[channel + k * CHANNEL_COUNT] -= tmp2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp2 = noiseGlobalMean shr 7
|
||||||
|
if (tmp2 > MAXIMUM_NOISE[channel]) {
|
||||||
|
tmp2 -= MAXIMUM_NOISE[channel]
|
||||||
|
|
||||||
|
for (k in 0 until GAUSSIAN_COUNT) {
|
||||||
|
noiseMeans[channel + k * CHANNEL_COUNT] -= tmp2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
frameCount++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Smooth with respect to transition hysteresis.
|
||||||
|
if (vadFlag == 0) {
|
||||||
|
if (overhang > 0) {
|
||||||
|
vadFlag = 2 + overhang
|
||||||
|
overhang--
|
||||||
|
}
|
||||||
|
speechFrameCount = 0
|
||||||
|
} else {
|
||||||
|
speechFrameCount++
|
||||||
|
if (speechFrameCount > MAX_SPEECH_FRAMES) {
|
||||||
|
speechFrameCount = MAX_SPEECH_FRAMES
|
||||||
|
overhang = thresholds.overhangMax2
|
||||||
|
} else {
|
||||||
|
overhang = thresholds.overhangMax1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return vadFlag
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates and returns the smoothed feature minimum. As minimum we use the median of the five
|
||||||
|
* smallest feature values in a 100 frames long window.
|
||||||
|
*
|
||||||
|
* Inserts [featureValue] into [minimumValueVector], if it is one of the 16 smallest values the
|
||||||
|
* last 100 frames. Then calculates and returns the median of the five smallest values.
|
||||||
|
*
|
||||||
|
* As long as [frameCount] is zero, that is, we haven't received any "valid" data, [findMinimum]
|
||||||
|
* outputs the default value of 1600.
|
||||||
|
*
|
||||||
|
* @param[featureValue] New feature value to update with.
|
||||||
|
* @param[channel] Channel number.
|
||||||
|
* @return Smoothed minimum value for a moving window.
|
||||||
|
*/
|
||||||
|
private fun findMinimum(featureValue: Int, channel: Int): Int {
|
||||||
|
var position = -1
|
||||||
|
var currentMedian = 1600
|
||||||
|
var alpha = 0
|
||||||
|
var tmp: Int
|
||||||
|
val offset = channel * 16
|
||||||
|
|
||||||
|
// Accessor for the age of each value of the channel
|
||||||
|
val age = object {
|
||||||
|
operator fun get(i: Int) = ageVector[offset + i]
|
||||||
|
operator fun set(i: Int, value: Int) { ageVector[offset + i] = value }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accessor for the 16 minimum values of the channel
|
||||||
|
val smallestValues = object {
|
||||||
|
operator fun get(i: Int) = minimumValueVector[offset + i]
|
||||||
|
operator fun set(i: Int, value: Int) { minimumValueVector[offset + i] = value }
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(channel < CHANNEL_COUNT)
|
||||||
|
|
||||||
|
// Each value in smallestValues is getting 1 loop older. Update age and remove old values.
|
||||||
|
for (i in 0 until 16) {
|
||||||
|
if (age[i] != 100) {
|
||||||
|
age[i]++
|
||||||
|
} else {
|
||||||
|
// Too old value. Remove from memory and shift larger values downwards.
|
||||||
|
for (j in i until 15) {
|
||||||
|
smallestValues[j] = smallestValues[j + 1]
|
||||||
|
age[j] = age[j + 1]
|
||||||
|
}
|
||||||
|
age[15] = 101
|
||||||
|
smallestValues[15] = 10000
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if featureValue is smaller than any of the values in smallest_values.
|
||||||
|
// If so, find the position where to insert the new value.
|
||||||
|
if (featureValue < smallestValues[7]) {
|
||||||
|
position = when {
|
||||||
|
featureValue < smallestValues[3] ->
|
||||||
|
when {
|
||||||
|
featureValue < smallestValues[1] -> if (featureValue < smallestValues[0]) 0 else 1
|
||||||
|
featureValue < smallestValues[2] -> 2
|
||||||
|
else -> 3
|
||||||
|
}
|
||||||
|
featureValue < smallestValues[5] -> if (featureValue < smallestValues[4]) 4 else 5
|
||||||
|
featureValue < smallestValues[6] -> 6
|
||||||
|
else -> 7
|
||||||
|
}
|
||||||
|
} else if (featureValue < smallestValues[15]) {
|
||||||
|
position = when {
|
||||||
|
featureValue < smallestValues[11] -> when {
|
||||||
|
featureValue < smallestValues[9] -> if (featureValue < smallestValues[8]) 8 else 9
|
||||||
|
featureValue < smallestValues[10] -> 10
|
||||||
|
else -> 11
|
||||||
|
}
|
||||||
|
featureValue < smallestValues[13] -> if (featureValue < smallestValues[12]) 12 else 13
|
||||||
|
featureValue < smallestValues[14] -> 14
|
||||||
|
else -> 15
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we have detected a new small value, insert it at the correct position and shift larger
|
||||||
|
// values up.
|
||||||
|
if (position > -1) {
|
||||||
|
for (i in 15 downTo position + 1) {
|
||||||
|
smallestValues[i] = smallestValues[i - 1]
|
||||||
|
age[i] = age[i - 1]
|
||||||
|
}
|
||||||
|
smallestValues[position] = featureValue
|
||||||
|
age[position] = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get currentMedian
|
||||||
|
if (frameCount > 2) {
|
||||||
|
currentMedian = smallestValues[2]
|
||||||
|
} else if (frameCount > 0) {
|
||||||
|
currentMedian = smallestValues[0]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Smooth the median value.
|
||||||
|
if (frameCount > 0) {
|
||||||
|
alpha = if (currentMedian < median[channel]) {
|
||||||
|
SMOOTHING_DOWN // 0.2 in Q15.
|
||||||
|
} else {
|
||||||
|
SMOOTHING_UP // 0.99 in Q15.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tmp = (alpha + 1) * median[channel]
|
||||||
|
tmp += (Short.MAX_VALUE - alpha) * currentMedian
|
||||||
|
tmp += 16384
|
||||||
|
median[channel] = tmp shr 15
|
||||||
|
|
||||||
|
return median[channel]
|
||||||
|
}
|
||||||
|
|
||||||
|
private data class FeatureResult(
|
||||||
|
// 10 * log10(energy in each frequency band), Q4
|
||||||
|
val features: List<Int>,
|
||||||
|
// Total energy of the signal
|
||||||
|
// NOTE: This value is not exact. It is only used in a comparison.
|
||||||
|
val totalEnergy: Int
|
||||||
|
)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes an audio buffer and calculates the logarithm of the energy of each of the
|
||||||
|
* [CHANNEL_COUNT] = 6 frequency bands used by the VAD:
|
||||||
|
* 80-250 Hz, 250-500 Hz, 500-1000 Hz, 1000-2000 Hz, 2000-3000 Hz, 3000-4000 Hz.
|
||||||
|
*
|
||||||
|
* The values are given in Q4 and written to features. Further, an approximate overall energy is
|
||||||
|
* returned. The return value is used in [getGmmProbability] as a signal indicator, hence it is
|
||||||
|
* arbitrary above the threshold [MIN_ENERGY].
|
||||||
|
*/
|
||||||
|
private fun calculateFeatures(input: AudioBuffer): FeatureResult {
|
||||||
|
assert(input.size == 80)
|
||||||
|
|
||||||
|
// Split at 2000 Hz and downsample.
|
||||||
|
var frequencyBand = 0
|
||||||
|
val `0 to 4000 Hz` = input
|
||||||
|
val (`2000 to 4000 Hz`, `0 to 2000 Hz`) =
|
||||||
|
splitFilter(`0 to 4000 Hz`, upperState[frequencyBand], lowerState[frequencyBand])
|
||||||
|
|
||||||
|
// For the upper band (2000 to 4000 Hz) split at 3000 Hz and downsample.
|
||||||
|
frequencyBand = 1
|
||||||
|
val (`3000 to 4000 Hz`, `2000 to 3000 Hz`) =
|
||||||
|
splitFilter(`2000 to 4000 Hz`, upperState[frequencyBand], lowerState[frequencyBand])
|
||||||
|
|
||||||
|
// For the lower band (0 to 2000 Hz) split at 1000 Hz and downsample.
|
||||||
|
frequencyBand = 2
|
||||||
|
val (`1000 to 2000 Hz`, `0 to 1000 Hz`) =
|
||||||
|
splitFilter(`0 to 2000 Hz`, upperState[frequencyBand], lowerState[frequencyBand])
|
||||||
|
|
||||||
|
// For the lower band (0 to 1000 Hz) split at 500 Hz and downsample.
|
||||||
|
frequencyBand = 3
|
||||||
|
val (`500 to 1000 Hz`, `0 to 500 Hz`) =
|
||||||
|
splitFilter(`0 to 1000 Hz`, upperState[frequencyBand], lowerState[frequencyBand])
|
||||||
|
|
||||||
|
// For the lower band (0 t0 500 Hz) split at 250 Hz and downsample.
|
||||||
|
frequencyBand = 4
|
||||||
|
val (`250 to 500 Hz`, `0 to 250 Hz`) =
|
||||||
|
splitFilter(`0 to 500 Hz`, upperState[frequencyBand], lowerState[frequencyBand])
|
||||||
|
|
||||||
|
// Remove 0 to 80 Hz by high pass filtering the lower band.
|
||||||
|
val `80 to 250 Hz` = highPassFilter(`0 to 250 Hz`, highPassFilterState)
|
||||||
|
|
||||||
|
val totalEnergy = MutableInt(0)
|
||||||
|
val `energy in 3000 to 4000 Hz` = logOfEnergy(`3000 to 4000 Hz`, SPLIT_FILTER_OFFSETS[5], totalEnergy)
|
||||||
|
val `energy in 2000 to 3000 Hz` = logOfEnergy(`2000 to 3000 Hz`, SPLIT_FILTER_OFFSETS[4], totalEnergy)
|
||||||
|
val `energy in 1000 to 2000 Hz` = logOfEnergy(`1000 to 2000 Hz`, SPLIT_FILTER_OFFSETS[3], totalEnergy)
|
||||||
|
val `energy in 500 to 1000 Hz` = logOfEnergy(`500 to 1000 Hz`, SPLIT_FILTER_OFFSETS[2], totalEnergy)
|
||||||
|
val `energy in 250 to 500 Hz` = logOfEnergy(`250 to 500 Hz`, SPLIT_FILTER_OFFSETS[1], totalEnergy)
|
||||||
|
val `energy in 50 to 250 Hz` = logOfEnergy(`80 to 250 Hz`, SPLIT_FILTER_OFFSETS[0], totalEnergy)
|
||||||
|
|
||||||
|
val features = listOf(
|
||||||
|
`energy in 50 to 250 Hz`,
|
||||||
|
`energy in 250 to 500 Hz`,
|
||||||
|
`energy in 500 to 1000 Hz`,
|
||||||
|
`energy in 1000 to 2000 Hz`,
|
||||||
|
`energy in 2000 to 3000 Hz`,
|
||||||
|
`energy in 3000 to 4000 Hz`
|
||||||
|
)
|
||||||
|
assert(features.size == CHANNEL_COUNT)
|
||||||
|
return FeatureResult(features, totalEnergy.toInt())
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculates a VAD decision for the specified audio frame, which must be 100 ms long and
|
||||||
|
* sampled at 8 kHz.
|
||||||
|
*
|
||||||
|
* @return If true, the frame contains voice activity.
|
||||||
|
*/
|
||||||
|
fun process(audioFrame: AudioBuffer): Boolean {
|
||||||
|
// Get power in the bands
|
||||||
|
val (features, totalEnergy) = calculateFeatures(audioFrame)
|
||||||
|
|
||||||
|
// Make a VAD
|
||||||
|
val vadResult = getGmmProbability(features, totalEnergy, audioFrame.size)
|
||||||
|
|
||||||
|
// return vad != 0
|
||||||
|
return vadResult == 1
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue