Removing DC offset from audio
Also a bit of refactoring regarding audio processing
This commit is contained in:
parent
af5a6649c1
commit
a8900f80ec
|
@ -102,7 +102,9 @@ set(SOURCE_FILES
|
||||||
src/platformTools.cpp
|
src/platformTools.cpp
|
||||||
src/tools.cpp
|
src/tools.cpp
|
||||||
src/audio/AudioStream.cpp
|
src/audio/AudioStream.cpp
|
||||||
|
src/audio/DCOffset.cpp
|
||||||
src/audio/SampleRateConverter.cpp
|
src/audio/SampleRateConverter.cpp
|
||||||
|
src/audio/UnboundedStream.cpp
|
||||||
src/audio/WaveFileReader.cpp
|
src/audio/WaveFileReader.cpp
|
||||||
src/audio/waveFileWriting.cpp
|
src/audio/waveFileWriting.cpp
|
||||||
src/stringTools.cpp
|
src/stringTools.cpp
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
#include "DCOffset.h"
|
||||||
|
#include <gsl_util.h>
|
||||||
|
|
||||||
|
// Wraps inputStream in a filter that shifts its samples by a constant offset.
// The attenuation factor 1 / (1 + |offset|) is computed once here and cached
// for use by readSample().
DCOffset::DCOffset(std::unique_ptr<AudioStream> inputStream, float offset) :
    inputStream(std::move(inputStream)),
    offset(offset),
    factor(1 / (1 + std::abs(offset)))
{
}
|
||||||
|
|
||||||
|
// Clone constructor: clones the wrapped stream of rhs (forwarding the reset flag
// to it) and copies the offset and attenuation factor unchanged.
DCOffset::DCOffset(const DCOffset& rhs, bool reset) :
    inputStream(rhs.inputStream->clone(reset)),
    offset(rhs.offset),
    factor(rhs.factor)
{
}
|
||||||
|
|
||||||
|
std::unique_ptr<AudioStream> DCOffset::clone(bool reset) {
|
||||||
|
return std::make_unique<DCOffset>(*this, reset);
|
||||||
|
}
|
||||||
|
|
||||||
|
int DCOffset::getSampleRate() {
|
||||||
|
return inputStream->getSampleRate();
|
||||||
|
}
|
||||||
|
|
||||||
|
int DCOffset::getSampleCount() {
|
||||||
|
return inputStream->getSampleCount();
|
||||||
|
}
|
||||||
|
|
||||||
|
int DCOffset::getSampleIndex() {
|
||||||
|
return inputStream->getSampleIndex();
|
||||||
|
}
|
||||||
|
|
||||||
|
void DCOffset::seek(int sampleIndex) {
|
||||||
|
inputStream->seek(sampleIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
float DCOffset::readSample() {
|
||||||
|
float sample = inputStream->readSample();
|
||||||
|
return sample * factor + offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon) {
|
||||||
|
if (std::abs(offset) < epsilon) return audioStream;
|
||||||
|
return std::make_unique<DCOffset>(std::move(audioStream), offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Estimates the DC offset of an audio stream as a weighted mean of its samples.
//
// - Streams longer than 4 seconds: the first 3 seconds are averaged with weight 1,
//   then the 4th second is faded out with linearly decreasing weights.
// - Shorter streams: all samples are averaged with weight 1.
// - Empty streams: the offset is reported as 0.
//
// The stream is read from index 0; its original read position is restored
// before returning (also if reading throws).
float getDCOffset(AudioStream& audioStream) {
    const int sampleCount = audioStream.getSampleCount();

    // Nothing to measure. Return before seeking: a stream may reject seek(0)
    // when it holds no samples (SampleRateConverter::seek does).
    if (sampleCount <= 0) return 0.0f;

    const int sampleRate = audioStream.getSampleRate();
    int flatMeanSampleCount;
    int fadingMeanSampleCount;
    if (sampleCount > 4 * sampleRate) {
        // Long audio file. Average over the first 3 seconds, then fade out over the 4th.
        flatMeanSampleCount = 3 * sampleRate;
        fadingMeanSampleCount = 1 * sampleRate;
    } else {
        // Short audio file. Average over the entire length.
        flatMeanSampleCount = sampleCount;
        fadingMeanSampleCount = 0;
    }

    const int originalSampleIndex = audioStream.getSampleIndex();
    audioStream.seek(0);
    auto restorePosition = gsl::finally([&]() { audioStream.seek(originalSampleIndex); });

    // The total weight is accumulated alongside the sum so that the two always
    // agree. The earlier closed form (fadingMeanSampleCount + 1) / 2 added a
    // spurious 0.5 when there was no fading section, biasing short clips.
    double sum = 0;
    double totalWeight = 0;
    for (int i = 0; i < flatMeanSampleCount; i++) {
        sum += audioStream.readSample();
        totalWeight += 1.0;
    }
    for (int i = 0; i < fadingMeanSampleCount; i++) {
        const double weight = static_cast<double>(fadingMeanSampleCount - i) / fadingMeanSampleCount;
        sum += audioStream.readSample() * weight;
        totalWeight += weight;
    }

    // Guards against division by zero if no sample was read (e.g. a non-positive sample rate).
    if (totalWeight <= 0) return 0.0f;

    return static_cast<float>(sum / totalWeight);
}
|
||||||
|
|
||||||
|
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> inputStream) {
|
||||||
|
float offset = getDCOffset(*inputStream.get());
|
||||||
|
return addDCOffset(std::move(inputStream), -offset);
|
||||||
|
}
|
|
@ -0,0 +1,26 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "AudioStream.h"
|
||||||
|
|
||||||
|
// Applies a constant DC offset to an audio stream and reduces its amplitude
|
||||||
|
// to prevent clipping
|
||||||
|
class DCOffset : public AudioStream {
|
||||||
|
public:
|
||||||
|
DCOffset(std::unique_ptr<AudioStream> inputStream, float offset);
|
||||||
|
DCOffset(const DCOffset& rhs, bool reset);
|
||||||
|
std::unique_ptr<AudioStream> clone(bool reset) override;
|
||||||
|
int getSampleRate() override;
|
||||||
|
int getSampleCount() override;
|
||||||
|
int getSampleIndex() override;
|
||||||
|
void seek(int sampleIndex) override;
|
||||||
|
float readSample() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::unique_ptr<AudioStream> inputStream;
|
||||||
|
float offset;
|
||||||
|
float factor;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns a stream that applies the given DC offset to audioStream.
// If the magnitude of offset is below epsilon, audioStream is returned unchanged.
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon = 1.0f / 15000);
|
||||||
|
|
||||||
|
// Measures the DC offset of audioStream and returns a stream with the negated offset applied.
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> audioStream);
|
|
@ -2,25 +2,29 @@
|
||||||
#include "SampleRateConverter.h"
|
#include "SampleRateConverter.h"
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <format.h>
|
||||||
|
|
||||||
using std::runtime_error;
|
using std::invalid_argument;
|
||||||
|
|
||||||
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate) :
|
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate) :
|
||||||
inputStream(std::move(inputStream)),
|
inputStream(std::move(inputStream)),
|
||||||
downscalingFactor(static_cast<double>(this->inputStream->getSampleRate()) / outputFrameRate),
|
downscalingFactor(static_cast<double>(this->inputStream->getSampleRate()) / outputSampleRate),
|
||||||
outputFrameRate(outputFrameRate),
|
outputSampleRate(outputSampleRate),
|
||||||
outputFrameCount(std::lround(this->inputStream->getSampleCount() / downscalingFactor)),
|
outputSampleCount(std::lround(this->inputStream->getSampleCount() / downscalingFactor)),
|
||||||
lastInputSample(0),
|
lastInputSample(0),
|
||||||
lastInputSampleIndex(-1),
|
lastInputSampleIndex(-1),
|
||||||
nextOutputSampleIndex(0)
|
nextOutputSampleIndex(0)
|
||||||
{
|
{
|
||||||
if (this->inputStream->getSampleRate() < outputFrameRate) {
|
if (outputSampleRate <= 0) {
|
||||||
throw runtime_error("Upsampling not supported.");
|
throw invalid_argument("Sample rate must be positive.");
|
||||||
|
}
|
||||||
|
if (this->inputStream->getSampleRate() < outputSampleRate) {
|
||||||
|
throw invalid_argument(fmt::format("Upsampling not supported. Audio sample rate must not be below {}Hz.", outputSampleRate));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
SampleRateConverter::SampleRateConverter(const SampleRateConverter& rhs, bool reset) :
|
SampleRateConverter::SampleRateConverter(const SampleRateConverter& rhs, bool reset) :
|
||||||
SampleRateConverter(rhs.inputStream->clone(reset), outputFrameRate)
|
SampleRateConverter(rhs.inputStream->clone(reset), rhs.outputSampleRate)
|
||||||
{
|
{
|
||||||
nextOutputSampleIndex = reset ? 0 : rhs.nextOutputSampleIndex;
|
nextOutputSampleIndex = reset ? 0 : rhs.nextOutputSampleIndex;
|
||||||
}
|
}
|
||||||
|
@ -30,11 +34,11 @@ std::unique_ptr<AudioStream> SampleRateConverter::clone(bool reset) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int SampleRateConverter::getSampleRate() {
|
int SampleRateConverter::getSampleRate() {
|
||||||
return outputFrameRate;
|
return outputSampleRate;
|
||||||
}
|
}
|
||||||
|
|
||||||
int SampleRateConverter::getSampleCount() {
|
int SampleRateConverter::getSampleCount() {
|
||||||
return outputFrameCount;
|
return outputSampleCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
int SampleRateConverter::getSampleIndex() {
|
int SampleRateConverter::getSampleIndex() {
|
||||||
|
@ -42,13 +46,13 @@ int SampleRateConverter::getSampleIndex() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void SampleRateConverter::seek(int sampleIndex) {
|
void SampleRateConverter::seek(int sampleIndex) {
|
||||||
if (sampleIndex < 0 || sampleIndex >= outputFrameCount) throw std::invalid_argument("sampleIndex out of range.");
|
if (sampleIndex < 0 || sampleIndex >= outputSampleCount) throw std::invalid_argument("sampleIndex out of range.");
|
||||||
|
|
||||||
nextOutputSampleIndex = sampleIndex;
|
nextOutputSampleIndex = sampleIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
float SampleRateConverter::readSample() {
|
float SampleRateConverter::readSample() {
|
||||||
if (nextOutputSampleIndex >= outputFrameCount) throw std::out_of_range("End of stream.");
|
if (nextOutputSampleIndex >= outputSampleCount) throw std::out_of_range("End of stream.");
|
||||||
|
|
||||||
double inputStart = nextOutputSampleIndex * downscalingFactor;
|
double inputStart = nextOutputSampleIndex * downscalingFactor;
|
||||||
double inputEnd = (nextOutputSampleIndex + 1) * downscalingFactor;
|
double inputEnd = (nextOutputSampleIndex + 1) * downscalingFactor;
|
||||||
|
@ -92,3 +96,10 @@ float SampleRateConverter::getInputSample(int sampleIndex) {
|
||||||
lastInputSampleIndex = sampleIndex;
|
lastInputSampleIndex = sampleIndex;
|
||||||
return lastInputSample;
|
return lastInputSample;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate) {
|
||||||
|
if (sampleRate == audioStream->getSampleRate()) {
|
||||||
|
return audioStream;
|
||||||
|
}
|
||||||
|
return std::make_unique<SampleRateConverter>(std::move(audioStream), sampleRate);
|
||||||
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
|
|
||||||
class SampleRateConverter : public AudioStream {
|
class SampleRateConverter : public AudioStream {
|
||||||
public:
|
public:
|
||||||
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate);
|
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate);
|
||||||
SampleRateConverter(const SampleRateConverter& rhs, bool reset);
|
SampleRateConverter(const SampleRateConverter& rhs, bool reset);
|
||||||
std::unique_ptr<AudioStream> clone(bool reset) override;
|
std::unique_ptr<AudioStream> clone(bool reset) override;
|
||||||
int getSampleRate() override;
|
int getSampleRate() override;
|
||||||
|
@ -15,10 +15,10 @@ public:
|
||||||
float readSample() override;
|
float readSample() override;
|
||||||
private:
|
private:
|
||||||
std::unique_ptr<AudioStream> inputStream;
|
std::unique_ptr<AudioStream> inputStream;
|
||||||
double downscalingFactor; // input frame rate / output frame rate
|
double downscalingFactor; // input sample rate / output sample rate
|
||||||
|
|
||||||
int outputFrameRate;
|
int outputSampleRate;
|
||||||
int outputFrameCount;
|
int outputSampleCount;
|
||||||
|
|
||||||
float lastInputSample;
|
float lastInputSample;
|
||||||
int lastInputSampleIndex;
|
int lastInputSampleIndex;
|
||||||
|
@ -28,3 +28,5 @@ private:
|
||||||
float mean(double start, double end);
|
float mean(double start, double end);
|
||||||
float getInputSample(int sampleIndex);
|
float getInputSample(int sampleIndex);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate);
|
|
@ -0,0 +1,59 @@
|
||||||
|
#include "UnboundedStream.h"
|
||||||
|
|
||||||
|
using boost::optional;
|
||||||
|
|
||||||
|
// Wraps inputStream so that it can be read before its start and past its end.
//
// Ownership is taken from the inputStream PARAMETER. (The earlier code moved the
// member innerStream into itself, leaving it uninitialised, and never used the
// parameter for ownership.) After the move, every further query goes through the
// innerStream member, which is declared - and therefore initialised - first.
//
// For an empty input stream both boundary samples are preset to 0, so
// readSample() never has to read from the inner stream. Otherwise they stay
// unset and are fetched lazily on first use.
UnboundedStream::UnboundedStream(std::unique_ptr<AudioStream> inputStream) :
    innerStream(std::move(inputStream)),
    sampleIndex(innerStream->getSampleIndex()),
    firstSample(innerStream->getSampleCount() ? optional<float>() : optional<float>(0.0f)),
    lastSample(innerStream->getSampleCount() ? optional<float>() : optional<float>(0.0f))
{}
|
||||||
|
|
||||||
|
// Clone constructor: clones the wrapped stream of rhs and copies the cached
// boundary samples.
//
// When reset is requested, the read position starts at 0 instead of being
// copied from rhs. This matches SampleRateConverter's clone constructor; the
// earlier code copied rhs.sampleIndex regardless of reset.
// NOTE(review): assumes "reset" means "rewind to sample 0" for all AudioStream
// implementations - confirm against AudioStream's documentation.
UnboundedStream::UnboundedStream(const UnboundedStream& rhs, bool reset) :
    innerStream(rhs.innerStream->clone(reset)),
    sampleIndex(reset ? 0 : rhs.sampleIndex),
    firstSample(rhs.firstSample),
    lastSample(rhs.lastSample)
{}
|
||||||
|
|
||||||
|
std::unique_ptr<AudioStream> UnboundedStream::clone(bool reset) {
|
||||||
|
return std::make_unique<UnboundedStream>(*this, reset);
|
||||||
|
}
|
||||||
|
|
||||||
|
int UnboundedStream::getSampleRate() {
|
||||||
|
return innerStream->getSampleRate();
|
||||||
|
}
|
||||||
|
|
||||||
|
int UnboundedStream::getSampleCount() {
|
||||||
|
return innerStream->getSampleCount();
|
||||||
|
}
|
||||||
|
|
||||||
|
int UnboundedStream::getSampleIndex() {
|
||||||
|
return sampleIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stores the new read position without any range check and without touching
// the wrapped stream; readSample() positions the wrapped stream when needed.
void UnboundedStream::seek(int sampleIndex) {
    const int target = sampleIndex;
    this->sampleIndex = target;
}
|
||||||
|
|
||||||
|
float UnboundedStream::readSample() {
|
||||||
|
if (sampleIndex < 0) {
|
||||||
|
if (!firstSample) {
|
||||||
|
innerStream->seek(0);
|
||||||
|
firstSample = innerStream->readSample();
|
||||||
|
}
|
||||||
|
return firstSample.get();
|
||||||
|
}
|
||||||
|
if (sampleIndex >= innerStream->getSampleCount()) {
|
||||||
|
if (!lastSample) {
|
||||||
|
innerStream->seek(innerStream->getSampleCount() - 1);
|
||||||
|
lastSample = innerStream->readSample();
|
||||||
|
}
|
||||||
|
return lastSample.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sampleIndex != innerStream->getSampleIndex()) {
|
||||||
|
innerStream->seek(sampleIndex);
|
||||||
|
}
|
||||||
|
return innerStream->readSample();
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "AudioStream.h"
|
||||||
|
#include <boost/optional/optional.hpp>
|
||||||
|
|
||||||
|
// Stream wrapper that allows reading before the start and past the end of the input stream.
|
||||||
|
class UnboundedStream : public AudioStream {
|
||||||
|
public:
|
||||||
|
UnboundedStream(std::unique_ptr<AudioStream> inputStream);
|
||||||
|
UnboundedStream(const UnboundedStream& rhs, bool reset);
|
||||||
|
std::unique_ptr<AudioStream> clone(bool reset) override;
|
||||||
|
int getSampleRate() override;
|
||||||
|
int getSampleCount() override;
|
||||||
|
int getSampleIndex() override;
|
||||||
|
void seek(int sampleIndex) override;
|
||||||
|
float readSample() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::unique_ptr<AudioStream> innerStream;
|
||||||
|
int sampleIndex;
|
||||||
|
boost::optional<float> firstSample, lastSample;
|
||||||
|
};
|
|
@ -10,6 +10,7 @@
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <gsl_util.h>
|
#include <gsl_util.h>
|
||||||
#include <logging.h>
|
#include <logging.h>
|
||||||
|
#include <audio/DCOffset.h>
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#include <pocketsphinx.h>
|
#include <pocketsphinx.h>
|
||||||
|
@ -32,17 +33,7 @@ using std::regex;
|
||||||
using std::regex_replace;
|
using std::regex_replace;
|
||||||
using std::chrono::duration;
|
using std::chrono::duration;
|
||||||
|
|
||||||
unique_ptr<AudioStream> to16kHz(unique_ptr<AudioStream> stream) {
|
constexpr int sphinxSampleRate = 16000;
|
||||||
// Downsample, if required
|
|
||||||
if (stream->getSampleRate() < 16000) {
|
|
||||||
throw invalid_argument("Audio sample rate must not be below 16kHz.");
|
|
||||||
}
|
|
||||||
if (stream->getSampleRate() != 16000) {
|
|
||||||
stream.reset(new SampleRateConverter(std::move(stream), 16000));
|
|
||||||
}
|
|
||||||
|
|
||||||
return stream;
|
|
||||||
}
|
|
||||||
|
|
||||||
lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {
|
lambda_unique_ptr<cmd_ln_t> createConfig(path sphinxModelDirectory) {
|
||||||
lambda_unique_ptr<cmd_ln_t> config(
|
lambda_unique_ptr<cmd_ln_t> config(
|
||||||
|
@ -151,7 +142,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
|
||||||
|
|
||||||
vector<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
|
vector<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
|
||||||
// Convert audio stream to the exact format PocketSphinx requires
|
// Convert audio stream to the exact format PocketSphinx requires
|
||||||
audioStream = to16kHz(std::move(audioStream));
|
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
||||||
|
|
||||||
// Start recognition
|
// Start recognition
|
||||||
int error = ps_start_utt(&recognizer);
|
int error = ps_start_utt(&recognizer);
|
||||||
|
@ -236,7 +227,7 @@ map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, uniqu
|
||||||
if (error) throw runtime_error("Error populating alignment struct.");
|
if (error) throw runtime_error("Error populating alignment struct.");
|
||||||
|
|
||||||
// Convert audio stream to the exact format PocketSphinx requires
|
// Convert audio stream to the exact format PocketSphinx requires
|
||||||
audioStream = to16kHz(std::move(audioStream));
|
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
||||||
|
|
||||||
// Create search structure
|
// Create search structure
|
||||||
acmod_t* acousticModel = recognizer.acmod;
|
acmod_t* acousticModel = recognizer.acmod;
|
||||||
|
@ -307,6 +298,9 @@ map<centiseconds, Phone> detectPhones(
|
||||||
// Redirect Pocketsphinx output to log
|
// Redirect Pocketsphinx output to log
|
||||||
err_set_callback(sphinxLogCallback, nullptr);
|
err_set_callback(sphinxLogCallback, nullptr);
|
||||||
|
|
||||||
|
// Make sure audio stream has no DC offset
|
||||||
|
audioStream = removeDCOffset(std::move(audioStream));
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Create PocketSphinx configuration
|
// Create PocketSphinx configuration
|
||||||
path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
|
path sphinxModelDirectory(getBinDirectory() / "res/sphinx");
|
||||||
|
|
Loading…
Reference in New Issue