Removing DC offset from audio
Also a bit of refactoring regarding audio processing
This commit is contained in:
parent
af5a6649c1
commit
a8900f80ec
|
@ -102,7 +102,9 @@ set(SOURCE_FILES
|
|||
src/platformTools.cpp
|
||||
src/tools.cpp
|
||||
src/audio/AudioStream.cpp
|
||||
src/audio/DCOffset.cpp
|
||||
src/audio/SampleRateConverter.cpp
|
||||
src/audio/UnboundedStream.cpp
|
||||
src/audio/WaveFileReader.cpp
|
||||
src/audio/waveFileWriting.cpp
|
||||
src/stringTools.cpp
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
#include "DCOffset.h"
|
||||
#include <gsl_util.h>
|
||||
|
||||
DCOffset::DCOffset(std::unique_ptr<AudioStream> inputStream, float offset) :
|
||||
inputStream(std::move(inputStream)),
|
||||
offset(offset),
|
||||
factor(1 / (1 + std::abs(offset)))
|
||||
{}
|
||||
|
||||
DCOffset::DCOffset(const DCOffset& rhs, bool reset) :
|
||||
inputStream(rhs.inputStream->clone(reset)),
|
||||
offset(rhs.offset),
|
||||
factor(rhs.factor)
|
||||
{}
|
||||
|
||||
std::unique_ptr<AudioStream> DCOffset::clone(bool reset) {
|
||||
return std::make_unique<DCOffset>(*this, reset);
|
||||
}
|
||||
|
||||
int DCOffset::getSampleRate() {
|
||||
return inputStream->getSampleRate();
|
||||
}
|
||||
|
||||
int DCOffset::getSampleCount() {
|
||||
return inputStream->getSampleCount();
|
||||
}
|
||||
|
||||
int DCOffset::getSampleIndex() {
|
||||
return inputStream->getSampleIndex();
|
||||
}
|
||||
|
||||
void DCOffset::seek(int sampleIndex) {
|
||||
inputStream->seek(sampleIndex);
|
||||
}
|
||||
|
||||
float DCOffset::readSample() {
|
||||
float sample = inputStream->readSample();
|
||||
return sample * factor + offset;
|
||||
}
|
||||
|
||||
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon) {
|
||||
if (std::abs(offset) < epsilon) return audioStream;
|
||||
return std::make_unique<DCOffset>(std::move(audioStream), offset);
|
||||
}
|
||||
|
||||
// Estimates the DC offset of audioStream as a weighted mean of its samples.
//
// For streams longer than 4 seconds, the first 3 seconds are averaged with
// full weight and the 4th second with a weight that fades linearly to zero.
// Shorter streams are averaged over their entire length.
//
// The stream is read from sample 0; its original read position is restored
// before the function returns (also if reading throws).
// Returns 0 for a stream without samples.
float getDCOffset(AudioStream& audioStream) {
	const int sampleRate = audioStream.getSampleRate();
	const int sampleCount = audioStream.getSampleCount();

	// Nothing to average. Bail out before seeking: seek(0) is not guaranteed
	// to be valid on a stream that contains no samples.
	if (sampleCount <= 0) return 0.0f;

	int flatMeanSampleCount, fadingMeanSampleCount;
	if (sampleCount > 4 * sampleRate) {
		// Long audio file. Average over the first 3 seconds, then fade out over the 4th.
		flatMeanSampleCount = 3 * sampleRate;
		fadingMeanSampleCount = 1 * sampleRate;
	} else {
		// Short audio file. Average over the entire length.
		flatMeanSampleCount = sampleCount;
		fadingMeanSampleCount = 0;
	}

	const int originalSampleIndex = audioStream.getSampleIndex();
	audioStream.seek(0);
	auto restorePosition = gsl::finally([&]() { audioStream.seek(originalSampleIndex); });

	double sum = 0;
	for (int i = 0; i < flatMeanSampleCount; i++) {
		sum += audioStream.readSample();
	}
	for (int i = 0; i < fadingMeanSampleCount; i++) {
		double weight = static_cast<double>(fadingMeanSampleCount - i) / fadingMeanSampleCount;
		sum += audioStream.readSample() * weight;
	}

	// The fading weights (N - i) / N for i = 0..N-1 add up to (N + 1) / 2.
	// Without a fading section (N == 0) no weight may be added at all;
	// otherwise the mean of short clips would be biased towards zero.
	const double fadingWeight = fadingMeanSampleCount > 0
		? (fadingMeanSampleCount + 1) / 2.0
		: 0.0;
	const double totalWeight = flatMeanSampleCount + fadingWeight;

	// Only reachable with a non-positive sample rate; avoid dividing by zero.
	if (totalWeight <= 0) return 0.0f;

	return static_cast<float>(sum / totalWeight);
}
|
||||
|
||||
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> inputStream) {
|
||||
float offset = getDCOffset(*inputStream.get());
|
||||
return addDCOffset(std::move(inputStream), -offset);
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
#pragma once
|
||||
|
||||
#include "AudioStream.h"
|
||||
|
||||
// Applies a constant DC offset to an audio stream and reduces its amplitude
|
||||
// to prevent clipping
|
||||
class DCOffset : public AudioStream {
|
||||
public:
|
||||
DCOffset(std::unique_ptr<AudioStream> inputStream, float offset);
|
||||
DCOffset(const DCOffset& rhs, bool reset);
|
||||
std::unique_ptr<AudioStream> clone(bool reset) override;
|
||||
int getSampleRate() override;
|
||||
int getSampleCount() override;
|
||||
int getSampleIndex() override;
|
||||
void seek(int sampleIndex) override;
|
||||
float readSample() override;
|
||||
|
||||
private:
|
||||
std::unique_ptr<AudioStream> inputStream;
|
||||
float offset;
|
||||
float factor;
|
||||
};
|
||||
|
||||
// Returns a stream that yields the samples of audioStream shifted by offset
// (wrapped in a DCOffset). If |offset| is smaller than epsilon, audioStream is
// returned unchanged.
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon = 1.0f / 15000);
|
||||
|
||||
// Measures the DC offset of audioStream and returns a stream with the inverse
// offset added to it.
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> audioStream);
|
|
@ -2,25 +2,29 @@
|
|||
#include "SampleRateConverter.h"
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
#include <format.h>
|
||||
|
||||
using std::runtime_error;
|
||||
using std::invalid_argument;
|
||||
|
||||
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate) :
|
||||
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate) :
|
||||
inputStream(std::move(inputStream)),
|
||||
downscalingFactor(static_cast<double>(this->inputStream->getSampleRate()) / outputFrameRate),
|
||||
outputFrameRate(outputFrameRate),
|
||||
outputFrameCount(std::lround(this->inputStream->getSampleCount() / downscalingFactor)),
|
||||
downscalingFactor(static_cast<double>(this->inputStream->getSampleRate()) / outputSampleRate),
|
||||
outputSampleRate(outputSampleRate),
|
||||
outputSampleCount(std::lround(this->inputStream->getSampleCount() / downscalingFactor)),
|
||||
lastInputSample(0),
|
||||
lastInputSampleIndex(-1),
|
||||
nextOutputSampleIndex(0)
|
||||
{
|
||||
if (this->inputStream->getSampleRate() < outputFrameRate) {
|
||||
throw runtime_error("Upsampling not supported.");
|
||||
if (outputSampleRate <= 0) {
|
||||
throw invalid_argument("Sample rate must be positive.");
|
||||
}
|
||||
if (this->inputStream->getSampleRate() < outputSampleRate) {
|
||||
throw invalid_argument(fmt::format("Upsampling not supported. Audio sample rate must not be below {}Hz.", outputSampleRate));
|
||||
}
|
||||
}
|
||||
|
||||
SampleRateConverter::SampleRateConverter(const SampleRateConverter& rhs, bool reset) :
|
||||
SampleRateConverter(rhs.inputStream->clone(reset), outputFrameRate)
|
||||
SampleRateConverter(rhs.inputStream->clone(reset), rhs.outputSampleRate)
|
||||
{
|
||||
nextOutputSampleIndex = reset ? 0 : rhs.nextOutputSampleIndex;
|
||||
}
|
||||
|
@ -30,11 +34,11 @@ std::unique_ptr<AudioStream> SampleRateConverter::clone(bool reset) {
|
|||
}
|
||||
|
||||
int SampleRateConverter::getSampleRate() {
|
||||
return outputFrameRate;
|
||||
return outputSampleRate;
|
||||
}
|
||||
|
||||
int SampleRateConverter::getSampleCount() {
|
||||
return outputFrameCount;
|
||||
return outputSampleCount;
|
||||
}
|
||||
|
||||
int SampleRateConverter::getSampleIndex() {
|
||||
|
@ -42,13 +46,13 @@ int SampleRateConverter::getSampleIndex() {
|
|||
}
|
||||
|
||||
void SampleRateConverter::seek(int sampleIndex) {
|
||||
if (sampleIndex < 0 || sampleIndex >= outputFrameCount) throw std::invalid_argument("sampleIndex out of range.");
|
||||
if (sampleIndex < 0 || sampleIndex >= outputSampleCount) throw std::invalid_argument("sampleIndex out of range.");
|
||||
|
||||
nextOutputSampleIndex = sampleIndex;
|
||||
}
|
||||
|
||||
float SampleRateConverter::readSample() {
|
||||
if (nextOutputSampleIndex >= outputFrameCount) throw std::out_of_range("End of stream.");
|
||||
if (nextOutputSampleIndex >= outputSampleCount) throw std::out_of_range("End of stream.");
|
||||
|
||||
double inputStart = nextOutputSampleIndex * downscalingFactor;
|
||||
double inputEnd = (nextOutputSampleIndex + 1) * downscalingFactor;
|
||||
|
@ -92,3 +96,10 @@ float SampleRateConverter::getInputSample(int sampleIndex) {
|
|||
lastInputSampleIndex = sampleIndex;
|
||||
return lastInputSample;
|
||||
}
|
||||
|
||||
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate) {
|
||||
if (sampleRate == audioStream->getSampleRate()) {
|
||||
return audioStream;
|
||||
}
|
||||
return std::make_unique<SampleRateConverter>(std::move(audioStream), sampleRate);
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
class SampleRateConverter : public AudioStream {
|
||||
public:
|
||||
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate);
|
||||
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate);
|
||||
SampleRateConverter(const SampleRateConverter& rhs, bool reset);
|
||||
std::unique_ptr<AudioStream> clone(bool reset) override;
|
||||
int getSampleRate() override;
|
||||
|
@ -15,10 +15,10 @@ public:
|
|||
float readSample() override;
|
||||
private:
|
||||
std::unique_ptr<AudioStream> inputStream;
|
||||
double downscalingFactor; // input frame rate / output frame rate
|
||||
double downscalingFactor; // input sample rate / output sample rate
|
||||
|
||||
int outputFrameRate;
|
||||
int outputFrameCount;
|
||||
int outputSampleRate;
|
||||
int outputSampleCount;
|
||||
|
||||
float lastInputSample;
|
||||
int lastInputSampleIndex;
|
||||
|
@ -28,3 +28,5 @@ private:
|
|||
float mean(double start, double end);
|
||||
float getInputSample(int sampleIndex);
|
||||
};
|
||||
|
||||
// Returns audioStream unchanged if it already has the given sample rate;
// otherwise returns it wrapped in a SampleRateConverter for that rate.
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate);
|
|
@ -0,0 +1,59 @@
|
|||
#include "UnboundedStream.h"
|
||||
|
||||
using boost::optional;
|
||||
|
||||
// Wraps inputStream (taking ownership of it) and starts at the wrapped
// stream's current read position.
//
// For a non-empty stream, the first and last sample are left unset and are
// read lazily when a position outside the stream is first requested. For an
// empty stream there is nothing to read, so both are preset to silence (0).
//
// Note: the initializers below read from the member innerStream, not from the
// moved-from parameter. This relies on innerStream being declared before the
// other data members in the class.
UnboundedStream::UnboundedStream(std::unique_ptr<AudioStream> inputStream) :
	innerStream(std::move(inputStream)),
	sampleIndex(innerStream->getSampleIndex()),
	firstSample(innerStream->getSampleCount() ? optional<float>() : 0.0f),
	lastSample(innerStream->getSampleCount() ? optional<float>() : 0.0f)
{}
|
||||
|
||||
// Clone constructor: wraps a clone of the stream wrapped by rhs and copies the
// cached first/last samples.
// With `reset` set, the clone starts reading at sample 0 instead of at the
// position of rhs, matching how SampleRateConverter handles `reset`.
UnboundedStream::UnboundedStream(const UnboundedStream& rhs, bool reset) :
	innerStream(rhs.innerStream->clone(reset)),
	sampleIndex(reset ? 0 : rhs.sampleIndex),
	firstSample(rhs.firstSample),
	lastSample(rhs.lastSample)
{}
|
||||
|
||||
std::unique_ptr<AudioStream> UnboundedStream::clone(bool reset) {
|
||||
return std::make_unique<UnboundedStream>(*this, reset);
|
||||
}
|
||||
|
||||
int UnboundedStream::getSampleRate() {
|
||||
return innerStream->getSampleRate();
|
||||
}
|
||||
|
||||
int UnboundedStream::getSampleCount() {
|
||||
return innerStream->getSampleCount();
|
||||
}
|
||||
|
||||
int UnboundedStream::getSampleIndex() {
|
||||
return sampleIndex;
|
||||
}
|
||||
|
||||
// Stores the new read position. No range check is performed and the wrapped
// stream is not touched here; readSample() positions it when needed.
void UnboundedStream::seek(int sampleIndex) {
	UnboundedStream::sampleIndex = sampleIndex;
}
|
||||
|
||||
float UnboundedStream::readSample() {
|
||||
if (sampleIndex < 0) {
|
||||
if (!firstSample) {
|
||||
innerStream->seek(0);
|
||||
firstSample = innerStream->readSample();
|
||||
}
|
||||
return firstSample.get();
|
||||
}
|
||||
if (sampleIndex >= innerStream->getSampleCount()) {
|
||||
if (!lastSample) {
|
||||
innerStream->seek(innerStream->getSampleCount() - 1);
|
||||
lastSample = innerStream->readSample();
|
||||
}
|
||||
return lastSample.get();
|
||||
}
|
||||
|
||||
if (sampleIndex != innerStream->getSampleIndex()) {
|
||||
innerStream->seek(sampleIndex);
|
||||
}
|
||||
return innerStream->readSample();
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
#pragma once
|
||||
|
||||
#include "AudioStream.h"
|
||||
#include <boost/optional/optional.hpp>
|
||||
|
||||
// Stream wrapper that allows reading before the start and past the end of the input stream.
|
||||
class UnboundedStream : public AudioStream {
|
||||
public:
|
||||
UnboundedStream(std::unique_ptr<AudioStream> inputStream);
|
||||
UnboundedStream(const UnboundedStream& rhs, bool reset);
|
||||
std::unique_ptr<AudioStream> clone(bool reset) override;
|
||||
int getSampleRate() override;
|
||||
int getSampleCount() override;
|
||||
int getSampleIndex() override;
|
||||
void seek(int sampleIndex) override;
|
||||
float readSample() override;
|
||||
|
||||
private:
|
||||
std::unique_ptr<AudioStream> innerStream;
|
||||
int sampleIndex;
|
||||
boost::optional<float> firstSample, lastSample;
|
||||
};
|
|
@ -10,6 +10,7 @@
|
|||
#include <regex>
|
||||
#include <gsl_util.h>
|
||||
#include <logging.h>
|
||||
#include <audio/DCOffset.h>
|
||||
|
||||
extern "C" {
|
||||
#include <pocketsphinx.h>
|
||||
|
@ -32,17 +33,7 @@ using std::regex;
|
|||
using std::regex_replace;
|
||||
using std::chrono::duration;
|
||||
|
||||
unique_ptr<AudioStream> to16kHz(unique_ptr<AudioStream> stream) {
|
||||
// Downsample, if required
|
||||
if (stream->getSampleRate() < 16000) {
|
||||
throw invalid_argument("Audio sample rate must not be below 16kHz.");
|
||||
}
|
||||
if (stream->getSampleRate() != 16000) {
|
||||
stream.reset(new SampleRateConverter(std::move(stream), 16000));
|
||||
}
|
||||
|
||||
return stream;
|
||||
}
|
||||
constexpr int sphinxSampleRate = 16000;
|
||||
|
||||
lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {
|
||||
lambda_unique_ptr<cmd_ln_t> config(
|
||||
|
@ -151,7 +142,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
|
|||
|
||||
vector<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
|
||||
// Convert audio stream to the exact format PocketSphinx requires
|
||||
audioStream = to16kHz(std::move(audioStream));
|
||||
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
||||
|
||||
// Start recognition
|
||||
int error = ps_start_utt(&recognizer);
|
||||
|
@ -236,7 +227,7 @@ map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, uniqu
|
|||
if (error) throw runtime_error("Error populating alignment struct.");
|
||||
|
||||
// Convert audio stream to the exact format PocketSphinx requires
|
||||
audioStream = to16kHz(std::move(audioStream));
|
||||
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
||||
|
||||
// Create search structure
|
||||
acmod_t* acousticModel = recognizer.acmod;
|
||||
|
@ -307,6 +298,9 @@ map<centiseconds, Phone> detectPhones(
|
|||
// Redirect Pocketsphinx output to log
|
||||
err_set_callback(sphinxLogCallback, nullptr);
|
||||
|
||||
// Make sure audio stream has no DC offset
|
||||
audioStream = removeDCOffset(std::move(audioStream));
|
||||
|
||||
try {
|
||||
// Create PocketSphinx configuration
|
||||
path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
|
||||
|
|
Loading…
Reference in New Issue