Removing DC offset from audio

Also a bit of refactoring regarding audio processing
This commit is contained in:
Daniel Wolf 2016-03-15 19:56:02 +01:00
parent af5a6649c1
commit a8900f80ec
8 changed files with 225 additions and 29 deletions

View File

@ -102,7 +102,9 @@ set(SOURCE_FILES
src/platformTools.cpp
src/tools.cpp
src/audio/AudioStream.cpp
src/audio/DCOffset.cpp
src/audio/SampleRateConverter.cpp
src/audio/UnboundedStream.cpp
src/audio/WaveFileReader.cpp
src/audio/waveFileWriting.cpp
src/stringTools.cpp

80
src/audio/DCOffset.cpp Normal file
View File

@ -0,0 +1,80 @@
#include "DCOffset.h"
#include <cmath>
#include <gsl_util.h>
// Wraps inputStream (taking ownership) so that the given DC offset is applied
// to every sample read. factor is the gain 1 / (1 + |offset|) that is used to
// scale the signal down as the offset grows.
DCOffset::DCOffset(std::unique_ptr<AudioStream> inputStream, float offset) :
	inputStream(std::move(inputStream)),
	offset(offset),
	// std::fabs from <cmath> always provides a floating-point overload.
	// std::abs may resolve to the integer version when only <cstdlib> is
	// visible, which would truncate the offset to 0 and yield factor == 1.
	factor(1.0f / (1.0f + std::fabs(offset)))
{}
DCOffset::DCOffset(const DCOffset& rhs, bool reset) :
inputStream(rhs.inputStream->clone(reset)),
offset(rhs.offset),
factor(rhs.factor)
{}
// Creates an independent copy of this stream via the cloning constructor.
std::unique_ptr<AudioStream> DCOffset::clone(bool reset) {
	std::unique_ptr<AudioStream> copy = std::make_unique<DCOffset>(*this, reset);
	return copy;
}
// The sample rate is taken directly from the wrapped stream.
int DCOffset::getSampleRate() {
	const int rate = inputStream->getSampleRate();
	return rate;
}
// The sample count is taken directly from the wrapped stream.
int DCOffset::getSampleCount() {
	const int count = inputStream->getSampleCount();
	return count;
}
// The read position is the wrapped stream's read position.
int DCOffset::getSampleIndex() {
	const int index = inputStream->getSampleIndex();
	return index;
}
void DCOffset::seek(int sampleIndex) {
inputStream->seek(sampleIndex);
}
// Reads the next sample from the wrapped stream and returns it with the DC
// offset applied.
//
// The sample is shifted first and scaled afterwards. With
// factor == 1 / (1 + |offset|), an input in [-1, 1] gives
// |sample + offset| <= 1 + |offset|, so the result stays within [-1, 1].
// Scaling before shifting (sample * factor + offset) could exceed that range
// and, when used to remove an offset d, left a residual mean of
// d * factor - d instead of 0.
// NOTE(review): assumes samples are nominally in [-1, 1] - confirm against
// the AudioStream contract.
float DCOffset::readSample() {
	const float sample = inputStream->readSample();
	return (sample + offset) * factor;
}
// Returns a stream that applies the given DC offset to audioStream.
// If the magnitude of offset is below epsilon, the offset is considered
// negligible and audioStream is handed back unchanged (no wrapper is created).
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon) {
	// std::fabs guarantees a floating-point comparison. std::abs could bind to
	// the integer overload when <cmath> is not visible, which would truncate
	// small offsets to 0 and make this test always succeed.
	const bool negligible = std::fabs(offset) < epsilon;
	if (negligible) return audioStream;

	return std::make_unique<DCOffset>(std::move(audioStream), offset);
}
// Estimates the DC offset of audioStream as a weighted mean of its leading
// samples.
//
// For streams longer than 4 seconds, the first 3 seconds are averaged with
// full weight and the 4th second with a weight that fades linearly towards 0.
// Shorter streams are averaged over their entire length.
// An empty stream yields 0.
//
// The stream's read position is restored before returning (also when an
// exception is thrown while reading).
float getDCOffset(AudioStream& audioStream) {
	const int sampleCount = audioStream.getSampleCount();
	// Nothing to average; also avoids seeking to index 0 of an empty stream,
	// which stream implementations may reject as out of range.
	if (sampleCount <= 0) return 0.0f;

	const int sampleRate = audioStream.getSampleRate();
	int flatMeanSampleCount, fadingMeanSampleCount;
	if (sampleCount > 4 * sampleRate) {
		// Long audio file. Average over the first 3 seconds, then fade out over the 4th.
		flatMeanSampleCount = 3 * sampleRate;
		fadingMeanSampleCount = 1 * sampleRate;
	} else {
		// Short audio file. Average over the entire length.
		flatMeanSampleCount = sampleCount;
		fadingMeanSampleCount = 0;
	}

	// Read from the start, then put the stream back where the caller left it.
	const int originalSampleIndex = audioStream.getSampleIndex();
	audioStream.seek(0);
	auto restorePosition = gsl::finally([&]() { audioStream.seek(originalSampleIndex); });

	double sum = 0;
	for (int i = 0; i < flatMeanSampleCount; i++) {
		sum += audioStream.readSample();
	}
	for (int i = 0; i < fadingMeanSampleCount; i++) {
		const double weight = static_cast<double>(fadingMeanSampleCount - i) / fadingMeanSampleCount;
		sum += audioStream.readSample() * weight;
	}

	// The fading weights add up to (F + (F - 1) + ... + 1) / F == (F + 1) / 2.
	// Without a fading section there is no such contribution at all; adding
	// the formula's 0.5 in that case would bias the mean of short streams.
	const double fadingWeight = fadingMeanSampleCount > 0
		? (fadingMeanSampleCount + 1) / 2.0
		: 0.0;
	const double totalWeight = flatMeanSampleCount + fadingWeight;
	return static_cast<float>(sum / totalWeight);
}
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> inputStream) {
float offset = getDCOffset(*inputStream.get());
return addDCOffset(std::move(inputStream), -offset);
}

26
src/audio/DCOffset.h Normal file
View File

@ -0,0 +1,26 @@
#pragma once
#include "AudioStream.h"
// Stream wrapper that applies a constant DC offset to an audio stream and
// reduces its amplitude to prevent clipping.
// Sample rate, sample count, read position and seeking are all forwarded to
// the wrapped stream; only the sample values differ.
class DCOffset : public AudioStream {
public:
// Takes ownership of inputStream and applies the given offset to its samples.
DCOffset(std::unique_ptr<AudioStream> inputStream, float offset);
// Cloning constructor: clones rhs's wrapped stream, passing reset on to it.
DCOffset(const DCOffset& rhs, bool reset);
std::unique_ptr<AudioStream> clone(bool reset) override;
int getSampleRate() override;
int getSampleCount() override;
int getSampleIndex() override;
void seek(int sampleIndex) override;
// Reads one sample from the wrapped stream and returns it combined with
// offset and factor.
float readSample() override;
private:
// The wrapped stream. Declared first because the members are initialized in
// declaration order.
std::unique_ptr<AudioStream> inputStream;
// DC offset to apply.
float offset;
// Amplitude scaling factor, 1 / (1 + |offset|).
float factor;
};
// Returns a stream that applies the given DC offset to audioStream.
// If |offset| is smaller than epsilon, audioStream is returned unchanged.
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon = 1.0f / 15000);
// Measures the DC offset of audioStream and returns a stream to which the
// opposite offset has been added.
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> audioStream);

View File

@ -2,25 +2,29 @@
#include "SampleRateConverter.h"
#include <stdexcept>
#include <algorithm>
#include <format.h>
using std::runtime_error;
using std::invalid_argument;
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate) :
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate) :
inputStream(std::move(inputStream)),
downscalingFactor(static_cast<double>(this->inputStream->getSampleRate()) / outputFrameRate),
outputFrameRate(outputFrameRate),
outputFrameCount(std::lround(this->inputStream->getSampleCount() / downscalingFactor)),
downscalingFactor(static_cast<double>(this->inputStream->getSampleRate()) / outputSampleRate),
outputSampleRate(outputSampleRate),
outputSampleCount(std::lround(this->inputStream->getSampleCount() / downscalingFactor)),
lastInputSample(0),
lastInputSampleIndex(-1),
nextOutputSampleIndex(0)
{
if (this->inputStream->getSampleRate() < outputFrameRate) {
throw runtime_error("Upsampling not supported.");
if (outputSampleRate <= 0) {
throw invalid_argument("Sample rate must be positive.");
}
if (this->inputStream->getSampleRate() < outputSampleRate) {
throw invalid_argument(fmt::format("Upsampling not supported. Audio sample rate must not be below {}Hz.", outputSampleRate));
}
}
SampleRateConverter::SampleRateConverter(const SampleRateConverter& rhs, bool reset) :
SampleRateConverter(rhs.inputStream->clone(reset), outputFrameRate)
SampleRateConverter(rhs.inputStream->clone(reset), rhs.outputSampleRate)
{
nextOutputSampleIndex = reset ? 0 : rhs.nextOutputSampleIndex;
}
@ -30,11 +34,11 @@ std::unique_ptr<AudioStream> SampleRateConverter::clone(bool reset) {
}
int SampleRateConverter::getSampleRate() {
return outputFrameRate;
return outputSampleRate;
}
int SampleRateConverter::getSampleCount() {
return outputFrameCount;
return outputSampleCount;
}
int SampleRateConverter::getSampleIndex() {
@ -42,13 +46,13 @@ int SampleRateConverter::getSampleIndex() {
}
void SampleRateConverter::seek(int sampleIndex) {
if (sampleIndex < 0 || sampleIndex >= outputFrameCount) throw std::invalid_argument("sampleIndex out of range.");
if (sampleIndex < 0 || sampleIndex >= outputSampleCount) throw std::invalid_argument("sampleIndex out of range.");
nextOutputSampleIndex = sampleIndex;
}
float SampleRateConverter::readSample() {
if (nextOutputSampleIndex >= outputFrameCount) throw std::out_of_range("End of stream.");
if (nextOutputSampleIndex >= outputSampleCount) throw std::out_of_range("End of stream.");
double inputStart = nextOutputSampleIndex * downscalingFactor;
double inputEnd = (nextOutputSampleIndex + 1) * downscalingFactor;
@ -92,3 +96,10 @@ float SampleRateConverter::getInputSample(int sampleIndex) {
lastInputSampleIndex = sampleIndex;
return lastInputSample;
}
// Returns a stream with the requested sample rate. A stream that already has
// that rate is passed through untouched; otherwise it is wrapped in a
// SampleRateConverter.
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate) {
	const bool needsConversion = sampleRate != audioStream->getSampleRate();
	if (!needsConversion) return audioStream;

	return std::make_unique<SampleRateConverter>(std::move(audioStream), sampleRate);
}

View File

@ -5,7 +5,7 @@
class SampleRateConverter : public AudioStream {
public:
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate);
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate);
SampleRateConverter(const SampleRateConverter& rhs, bool reset);
std::unique_ptr<AudioStream> clone(bool reset) override;
int getSampleRate() override;
@ -15,10 +15,10 @@ public:
float readSample() override;
private:
std::unique_ptr<AudioStream> inputStream;
double downscalingFactor; // input frame rate / output frame rate
double downscalingFactor; // input sample rate / output sample rate
int outputFrameRate;
int outputFrameCount;
int outputSampleRate;
int outputSampleCount;
float lastInputSample;
int lastInputSampleIndex;
@ -28,3 +28,5 @@ private:
float mean(double start, double end);
float getInputSample(int sampleIndex);
};
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate);

View File

@ -0,0 +1,59 @@
#include "UnboundedStream.h"
using boost::optional;
// Wraps inputStream (taking ownership), starting at its current read position.
//
// firstSample / lastSample cache the values returned for reads before the
// start and past the end. For an empty input stream there is nothing to read
// them from, so both are preset to 0; otherwise they start out empty and are
// filled in on demand.
//
// The parameter is moved into innerStream by the first initializer. All
// following initializers must therefore go through the innerStream member
// (which is declared, and hence initialized, first) rather than through the
// moved-from parameter.
UnboundedStream::UnboundedStream(std::unique_ptr<AudioStream> inputStream) :
	innerStream(std::move(inputStream)),
	sampleIndex(innerStream->getSampleIndex()),
	firstSample(innerStream->getSampleCount() ? optional<float>() : 0.0f),
	lastSample(innerStream->getSampleCount() ? optional<float>() : 0.0f)
{}
// Cloning constructor: clones the wrapped stream and copies the cached
// boundary samples.
// When reset is set, the copy starts reading at index 0 instead of at rhs's
// position, matching how SampleRateConverter treats the reset flag.
UnboundedStream::UnboundedStream(const UnboundedStream& rhs, bool reset) :
	innerStream(rhs.innerStream->clone(reset)),
	sampleIndex(reset ? 0 : rhs.sampleIndex),
	firstSample(rhs.firstSample),
	lastSample(rhs.lastSample)
{}
// Creates an independent copy of this stream via the cloning constructor.
std::unique_ptr<AudioStream> UnboundedStream::clone(bool reset) {
	std::unique_ptr<AudioStream> copy = std::make_unique<UnboundedStream>(*this, reset);
	return copy;
}
// The sample rate is taken directly from the wrapped stream.
int UnboundedStream::getSampleRate() {
	const int rate = innerStream->getSampleRate();
	return rate;
}
// Reports the sample count of the wrapped stream, even though reads outside
// that range are permitted.
int UnboundedStream::getSampleCount() {
	const int count = innerStream->getSampleCount();
	return count;
}
// Returns this wrapper's own read position, which may lie outside the range
// of the wrapped stream.
int UnboundedStream::getSampleIndex() {
	const int index = sampleIndex;
	return index;
}
// Stores the new read position. No range check is performed; the wrapped
// stream is only repositioned later, when a sample is actually read.
void UnboundedStream::seek(int sampleIndex) {
	const int target = sampleIndex;
	this->sampleIndex = target;
}
// Returns the sample at the current read position and advances the position
// by one.
//
// Positions before the start yield the first sample of the wrapped stream,
// positions at or past its end yield the last sample. Both boundary samples
// are read once and then cached. Inside the valid range, the wrapped stream
// is repositioned only if it is not already at the requested index.
//
// The position is advanced only after the sample has been obtained, so an
// exception from the wrapped stream leaves it unchanged. Without advancing,
// consecutive reads would return the same sample over and over.
float UnboundedStream::readSample() {
	const int sampleCount = innerStream->getSampleCount();

	float result;
	if (sampleIndex < 0) {
		// Before the start: repeat the first sample.
		if (!firstSample) {
			innerStream->seek(0);
			firstSample = innerStream->readSample();
		}
		result = firstSample.get();
	} else if (sampleIndex >= sampleCount) {
		// Past the end: repeat the last sample.
		if (!lastSample) {
			innerStream->seek(sampleCount - 1);
			lastSample = innerStream->readSample();
		}
		result = lastSample.get();
	} else {
		// Regular read; seek only if the wrapped stream is out of sync.
		if (sampleIndex != innerStream->getSampleIndex()) {
			innerStream->seek(sampleIndex);
		}
		result = innerStream->readSample();
	}

	++sampleIndex;
	return result;
}

View File

@ -0,0 +1,22 @@
#pragma once
#include "AudioStream.h"
#include <boost/optional/optional.hpp>
// Stream wrapper that allows reading before the start and past the end of the input stream.
// Reads at negative positions return the first sample of the input stream;
// reads at or beyond its sample count return its last sample.
class UnboundedStream : public AudioStream {
public:
// Takes ownership of inputStream and starts at its current read position.
UnboundedStream(std::unique_ptr<AudioStream> inputStream);
// Cloning constructor: clones rhs's wrapped stream, passing reset on to it.
UnboundedStream(const UnboundedStream& rhs, bool reset);
std::unique_ptr<AudioStream> clone(bool reset) override;
int getSampleRate() override;
// Returns the sample count of the wrapped stream.
int getSampleCount() override;
// Returns this wrapper's own position, which may be negative or past the end.
int getSampleIndex() override;
// Accepts any index; no range check is performed.
void seek(int sampleIndex) override;
float readSample() override;
private:
// The wrapped stream. Declared first because the members are initialized in
// declaration order.
std::unique_ptr<AudioStream> innerStream;
// Read position of this wrapper, tracked independently of innerStream.
int sampleIndex;
// Cached boundary samples returned for out-of-range reads; empty until first
// needed.
boost::optional<float> firstSample, lastSample;
};

View File

@ -10,6 +10,7 @@
#include <regex>
#include <gsl_util.h>
#include <logging.h>
#include <audio/DCOffset.h>
extern "C" {
#include <pocketsphinx.h>
@ -32,17 +33,7 @@ using std::regex;
using std::regex_replace;
using std::chrono::duration;
unique_ptr<AudioStream> to16kHz(unique_ptr<AudioStream> stream) {
// Downsample, if required
if (stream->getSampleRate() < 16000) {
throw invalid_argument("Audio sample rate must not be below 16kHz.");
}
if (stream->getSampleRate() != 16000) {
stream.reset(new SampleRateConverter(std::move(stream), 16000));
}
return stream;
}
constexpr int sphinxSampleRate = 16000;
lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {
lambda_unique_ptr<cmd_ln_t> config(
@ -151,7 +142,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
vector<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
// Convert audio stream to the exact format PocketSphinx requires
audioStream = to16kHz(std::move(audioStream));
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
// Start recognition
int error = ps_start_utt(&recognizer);
@ -236,7 +227,7 @@ map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, uniqu
if (error) throw runtime_error("Error populating alignment struct.");
// Convert audio stream to the exact format PocketSphinx requires
audioStream = to16kHz(std::move(audioStream));
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
// Create search structure
acmod_t* acousticModel = recognizer.acmod;
@ -307,6 +298,9 @@ map<centiseconds, Phone> detectPhones(
// Redirect Pocketsphinx output to log
err_set_callback(sphinxLogCallback, nullptr);
// Make sure audio stream has no DC offset
audioStream = removeDCOffset(std::move(audioStream));
try {
// Create PocketSphinx configuration
path sphinxModelDirectory(getBinDirectory() / "res/sphinx");