Refactored audio handling

Now audio clips can be passed around as const references
and don't carry state any more.
This commit is contained in:
Daniel Wolf 2016-07-20 20:42:27 +02:00
parent 799f334fa7
commit 26cae93478
26 changed files with 496 additions and 499 deletions

View File

@ -197,11 +197,10 @@ set(SOURCE_FILES
src/phoneExtraction.cpp src/phoneExtraction.h
src/platformTools.cpp src/platformTools.h
src/tools.cpp src/tools.h
src/audio/AudioStream.cpp src/audio/AudioStream.h
src/audio/AudioStreamSegment.cpp src/audio/AudioStreamSegment.h
src/audio/AudioClip.cpp src/audio/AudioClip.h
src/audio/AudioSegment.cpp src/audio/AudioSegment.h
src/audio/DCOffset.cpp src/audio/DCOffset.h
src/audio/SampleRateConverter.cpp src/audio/SampleRateConverter.h
src/audio/UnboundedStream.cpp src/audio/UnboundedStream.h
src/audio/voiceActivityDetection.cpp src/audio/voiceActivityDetection.h
src/audio/WaveFileReader.cpp src/audio/WaveFileReader.h
src/audio/waveFileWriting.cpp src/audio/waveFileWriting.h

65
src/audio/AudioClip.cpp Normal file
View File

@ -0,0 +1,65 @@
#include "AudioClip.h"
#include <format.h>
using std::invalid_argument;
// Returns the time range covered by this clip, starting at zero.
// The length is computed with integer division, i.e. truncated to whole centiseconds.
TimeRange AudioClip::getTruncatedRange() const {
    const auto lengthInCentiseconds = 100 * size() / getSampleRate();
    return TimeRange(0cs, centiseconds(lengthInCentiseconds));
}
// Callable wrapper around a SampleReader that validates the requested index
// against the clip size and remembers the most recently read sample.
class SafeSampleReader {
public:
// unsafeRead: reader that is invoked for indices that pass validation.
// size: number of samples; valid indices are 0 .. size - 1.
SafeSampleReader(SampleReader unsafeRead, AudioClip::size_type size);
// Returns the sample at the given index.
// Throws std::invalid_argument if the index is negative or not below size.
AudioClip::value_type operator()(AudioClip::size_type index);
private:
SampleReader unsafeRead;
AudioClip::size_type size;
// Index and value of the last sample read; -1 means nothing has been read yet.
AudioClip::size_type lastIndex = -1;
AudioClip::value_type lastSample = 0;
};
// Stores the wrapped reader together with the number of samples that may be read through it.
SafeSampleReader::SafeSampleReader(SampleReader reader, AudioClip::size_type sampleCount) :
    unsafeRead(reader),
    size(sampleCount)
{}
// Returns the sample at the given index.
//
// Throws std::invalid_argument if index is negative or not below the clip size.
// Reading the same index twice in a row is answered from a one-element cache
// without invoking the wrapped reader again.
inline AudioClip::value_type SafeSampleReader::operator()(AudioClip::size_type index) {
    if (index < 0) {
        throw invalid_argument(fmt::format("Cannot read from sample index {}. Index < 0.", index));
    }
    if (index >= size) {
        throw invalid_argument(fmt::format("Cannot read from sample index {}. Clip size is {}.", index, size));
    }
    if (index != lastIndex) {
        // Read first and only then update the cached index. If the wrapped reader
        // throws, the cache still describes the previous successful read, so a
        // retry at the same index cannot return a stale sample.
        lastSample = unsafeRead(index);
        lastIndex = index;
    }
    return lastSample;
}
// Creates a reader for this clip that validates every index against size()
// before delegating to the clip-specific unsafe reader.
SampleReader AudioClip::createSampleReader() const {
    SampleReader uncheckedReader = createUnsafeSampleReader();
    return SafeSampleReader(std::move(uncheckedReader), size());
}
// Returns an iterator positioned at the first sample (index 0).
AudioClip::iterator AudioClip::begin() const {
    const size_type firstIndex = 0;
    return SampleIterator(*this, firstIndex);
}
// Returns an iterator positioned one past the last sample (index size()).
AudioClip::iterator AudioClip::end() const {
    const size_type pastTheEndIndex = size();
    return SampleIterator(*this, pastTheEndIndex);
}
// Pipe syntax for effects: passes ownership of the clip to the effect
// and returns whatever clip the effect produces.
std::unique_ptr<AudioClip> operator|(std::unique_ptr<AudioClip> clip, AudioEffect effect) {
    std::unique_ptr<AudioClip> processedClip = effect(std::move(clip));
    return processedClip;
}
// Default-constructed iterator: positioned at index 0, with a default-constructed sample reader.
SampleIterator::SampleIterator() : sampleIndex(0) {}
// Creates an iterator over the given clip, positioned at startIndex.
// The reader factory captures the clip by reference.
// NOTE(review): presumably Lazy defers the factory call until first use, in which case
// the clip must outlive the iterator - confirm against Lazy.h.
SampleIterator::SampleIterator(const AudioClip& clip, size_type startIndex) :
    sampleReader([&clip] { return clip.createSampleReader(); }),
    sampleIndex(startIndex)
{}

141
src/audio/AudioClip.h Normal file
View File

@ -0,0 +1,141 @@
#pragma once
#include <memory>
#include "TimeRange.h"
#include <functional>
#include "Lazy.h"
class AudioClip;
class SampleIterator;
// Abstract base class for audio clips whose samples are read by index.
// All member functions are const; samples are read through separate reader objects.
class AudioClip {
public:
using value_type = float;
using size_type = int64_t;
using difference_type = int64_t;
using iterator = SampleIterator;
// Callable that returns the sample at a given sample index.
using SampleReader = std::function<value_type(size_type)>;
virtual ~AudioClip() {}
// Returns a copy of this clip.
virtual std::unique_ptr<AudioClip> clone() const = 0;
virtual int getSampleRate() const = 0;
// Number of samples in the clip.
virtual size_type size() const = 0;
// Time range from zero to the clip length, truncated to whole centiseconds.
TimeRange getTruncatedRange() const;
// Returns a reader that range-checks the index before reading.
SampleReader createSampleReader() const;
// Iterators positioned at index 0 and at index size(), respectively.
iterator begin() const;
iterator end() const;
private:
// Implemented by concrete clips; createSampleReader() wraps the result with index validation.
virtual SampleReader createUnsafeSampleReader() const = 0;
};
using AudioEffect = std::function<std::unique_ptr<AudioClip>(std::unique_ptr<AudioClip>)>;
std::unique_ptr<AudioClip> operator|(std::unique_ptr<AudioClip> clip, AudioEffect effect);
using SampleReader = AudioClip::SampleReader;
// Iterator-like cursor over the samples of an AudioClip.
// Holds a lazily created sample reader and the current sample index.
class SampleIterator {
public:
using value_type = AudioClip::value_type;
using size_type = AudioClip::size_type;
using difference_type = AudioClip::difference_type;
// Creates an iterator at index 0 that is not bound to a clip.
SampleIterator();
size_type getSampleIndex() const;
// Moves the iterator to the given sample index; the index is not validated here.
void seek(size_type sampleIndex);
// Reads the sample at the current index.
value_type operator*() const;
// Reads the sample n positions away from the current index.
value_type operator[](difference_type n) const;
private:
// AudioClip::begin() and AudioClip::end() use the private constructor below.
friend AudioClip;
SampleIterator(const AudioClip& audioClip, size_type sampleIndex);
Lazy<SampleReader> sampleReader;
size_type sampleIndex;
};
// Returns the sample index this iterator currently points at.
inline SampleIterator::size_type SampleIterator::getSampleIndex() const {
    return this->sampleIndex;
}
// Repositions the iterator; no range check is performed at this point.
inline void SampleIterator::seek(size_type newSampleIndex) {
    sampleIndex = newSampleIndex;
}
// Reads the sample at the iterator's current index.
inline SampleIterator::value_type SampleIterator::operator*() const {
    const SampleReader& read = *sampleReader;
    return read(sampleIndex);
}
// Reads the sample n positions away from the current index without moving the iterator.
inline SampleIterator::value_type SampleIterator::operator[](difference_type n) const {
    const size_type targetIndex = sampleIndex + n;
    const SampleReader& read = *sampleReader;
    return read(targetIndex);
}
// Relational operators for SampleIterator.
// All of them compare the sample indices only; which clip an iterator
// was created from is not taken into account.
inline bool operator==(const SampleIterator& lhs, const SampleIterator& rhs) {
return lhs.getSampleIndex() == rhs.getSampleIndex();
}
inline bool operator!=(const SampleIterator& lhs, const SampleIterator& rhs) {
return lhs.getSampleIndex() != rhs.getSampleIndex();
}
inline bool operator<(const SampleIterator& lhs, const SampleIterator& rhs) {
return lhs.getSampleIndex() < rhs.getSampleIndex();
}
inline bool operator>(const SampleIterator& lhs, const SampleIterator& rhs) {
return lhs.getSampleIndex() > rhs.getSampleIndex();
}
inline bool operator<=(const SampleIterator& lhs, const SampleIterator& rhs) {
return lhs.getSampleIndex() <= rhs.getSampleIndex();
}
inline bool operator>=(const SampleIterator& lhs, const SampleIterator& rhs) {
return lhs.getSampleIndex() >= rhs.getSampleIndex();
}
// Arithmetic operators for SampleIterator. All of them are expressed in terms of
// getSampleIndex() and seek(); none of them validates the resulting index.

// Advances the iterator by n samples and returns it.
inline SampleIterator& operator+=(SampleIterator& it, SampleIterator::difference_type n) {
it.seek(it.getSampleIndex() + n);
return it;
}
// Moves the iterator back by n samples and returns it.
inline SampleIterator& operator-=(SampleIterator& it, SampleIterator::difference_type n) {
it.seek(it.getSampleIndex() - n);
return it;
}
// Pre-increment: advances by one sample.
inline SampleIterator& operator++(SampleIterator& it) {
return operator+=(it, 1);
}
// Post-increment: advances by one sample and returns a copy of the previous state.
inline SampleIterator operator++(SampleIterator& it, int) {
SampleIterator tmp(it);
operator++(it);
return tmp;
}
// Pre-decrement: moves back by one sample.
inline SampleIterator& operator--(SampleIterator& it) {
return operator-=(it, 1);
}
// Post-decrement: moves back by one sample and returns a copy of the previous state.
inline SampleIterator operator--(SampleIterator& it, int) {
SampleIterator tmp(it);
operator--(it);
return tmp;
}
// Returns a copy of the iterator advanced by n samples.
inline SampleIterator operator+(const SampleIterator& it, SampleIterator::difference_type n) {
SampleIterator result(it);
result += n;
return result;
}
// Returns a copy of the iterator moved back by n samples.
inline SampleIterator operator-(const SampleIterator& it, SampleIterator::difference_type n) {
SampleIterator result(it);
result -= n;
return result;
}
// Returns the distance in samples between two iterators (lhs index minus rhs index).
inline SampleIterator::difference_type operator-(const SampleIterator& lhs, const SampleIterator& rhs) {
return lhs.getSampleIndex() - rhs.getSampleIndex();
}

View File

@ -0,0 +1,30 @@
#include "AudioSegment.h"
using std::unique_ptr;
using std::make_unique;
// Creates a clip that exposes only the given time range of the input clip.
// The range is converted from centiseconds to sample positions using the input sample rate.
// Throws std::invalid_argument if the resulting sample span does not lie within the input clip.
AudioSegment::AudioSegment(std::unique_ptr<AudioClip> inputClip, const TimeRange& range) :
    inputClip(std::move(inputClip)),
    sampleOffset(static_cast<int64_t>(range.getStart().count()) * this->inputClip->getSampleRate() / 100),
    sampleCount(static_cast<int64_t>(range.getLength().count()) * this->inputClip->getSampleRate() / 100)
{
    // Segment must not start before the input clip ...
    if (sampleOffset < 0) {
        throw std::invalid_argument("Segment extends beyond input clip.");
    }
    // ... and must not end after it.
    if (sampleOffset + sampleCount > this->inputClip->size()) {
        throw std::invalid_argument("Segment extends beyond input clip.");
    }
}
// Returns a copy of this segment, created through the copy constructor.
unique_ptr<AudioClip> AudioSegment::clone() const {
    const AudioSegment& self = *this;
    return make_unique<AudioSegment>(self);
}
// Returns a reader that maps segment-relative indices to input clip indices
// by adding the segment's sample offset.
SampleReader AudioSegment::createUnsafeSampleReader() const {
    SampleReader readInput = inputClip->createSampleReader();
    const size_type offset = sampleOffset;
    return [readInput = std::move(readInput), offset](size_type index) {
        return readInput(index + offset);
    };
}
// Creates an effect that wraps its input clip in an AudioSegment covering the given range.
AudioEffect segment(const TimeRange& range) {
    const TimeRange segmentRange = range;
    return [segmentRange](unique_ptr<AudioClip> clip) {
        return make_unique<AudioSegment>(std::move(clip), segmentRange);
    };
}

26
src/audio/AudioSegment.h Normal file
View File

@ -0,0 +1,26 @@
#pragma once
#include "AudioClip.h"
// Audio clip that exposes a time range of another clip.
class AudioSegment : public AudioClip {
public:
// Throws std::invalid_argument if the range does not lie within the input clip.
AudioSegment(std::unique_ptr<AudioClip> inputClip, const TimeRange& range);
std::unique_ptr<AudioClip> clone() const override;
int getSampleRate() const override;
size_type size() const override;
private:
SampleReader createUnsafeSampleReader() const override;
// Held as shared_ptr; copies of this segment made via clone() share the same input clip.
std::shared_ptr<AudioClip> inputClip;
// Position of the segment within the input clip and its length, both in samples.
size_type sampleOffset, sampleCount;
};
// The segment has the same sample rate as its input clip.
inline int AudioSegment::getSampleRate() const {
return inputClip->getSampleRate();
}
// Number of samples in the segment, as computed in the constructor.
inline AudioClip::size_type AudioSegment::size() const {
return sampleCount;
}
AudioEffect segment(const TimeRange& range);

View File

@ -1,9 +0,0 @@
#include "AudioStream.h"
// Returns the time range from zero to the stream length.
// The length uses integer division, i.e. it is truncated to whole centiseconds.
TimeRange AudioStream::getTruncatedRange() const {
return TimeRange(0cs, centiseconds(100 * getSampleCount() / getSampleRate()));
}
// True once the current sample index has reached (or passed) the sample count.
bool AudioStream::endOfStream() const {
return getSampleIndex() >= getSampleCount();
}

View File

@ -1,18 +0,0 @@
#pragma once
#include <memory>
#include "TimeRange.h"
// A mono stream of floating-point samples.
// Unlike a plain sample container, a stream has a current read position
// (getSampleIndex / seek / readSample).
class AudioStream {
public:
virtual ~AudioStream() {}
// Returns a copy of this stream. The reset flag is passed on by implementations;
// the ones visible here seek the copy to index 0 when it is true.
virtual std::unique_ptr<AudioStream> clone(bool reset) const = 0;
virtual int getSampleRate() const = 0;
virtual int64_t getSampleCount() const = 0;
// Time range from zero to the stream length, truncated to whole centiseconds.
TimeRange getTruncatedRange() const;
// Current read position, in samples.
virtual int64_t getSampleIndex() const = 0;
virtual void seek(int64_t sampleIndex) = 0;
// True once getSampleIndex() >= getSampleCount().
bool endOfStream() const;
virtual float readSample() = 0;
};

View File

@ -1,50 +0,0 @@
#include "AudioStreamSegment.h"
#include <stdexcept>
// Creates a stream that exposes only the given time range of the input stream.
// The range is converted from centiseconds to sample positions using the input sample rate.
// On success, the segment is positioned at its first sample.
// Throws std::invalid_argument if the resulting sample span does not lie within the input stream.
AudioStreamSegment::AudioStreamSegment(std::unique_ptr<AudioStream> audioStream, const TimeRange& range) :
    audioStream(std::move(audioStream)),
    sampleOffset(static_cast<int64_t>(range.getStart().count()) * this->audioStream->getSampleRate() / 100),
    sampleCount(static_cast<int64_t>(range.getLength().count()) * this->audioStream->getSampleRate() / 100)
{
    // Validate before touching the inner stream. Seeking first would hand an
    // out-of-range position to the inner stream, so an invalid range could surface
    // as that stream's seek error instead of the message below.
    if (sampleOffset < 0 || sampleOffset + sampleCount > this->audioStream->getSampleCount()) {
        throw std::invalid_argument("Segment extends beyond input stream.");
    }
    seek(0);
}
// Copies a segment. The inner stream is cloned with its position preserved;
// if reset is requested, the copy is then rewound to the start of the segment.
AudioStreamSegment::AudioStreamSegment(const AudioStreamSegment& rhs, bool reset) :
    audioStream(rhs.audioStream->clone(false)),
    sampleOffset(rhs.sampleOffset),
    sampleCount(rhs.sampleCount)
{
    if (!reset) {
        return;
    }
    seek(0);
}
// Returns a copy of this segment; see the (rhs, reset) constructor for the meaning of reset.
std::unique_ptr<AudioStream> AudioStreamSegment::clone(bool reset) const {
return std::make_unique<AudioStreamSegment>(*this, reset);
}
// The segment has the same sample rate as the inner stream.
int AudioStreamSegment::getSampleRate() const {
return audioStream->getSampleRate();
}
int64_t AudioStreamSegment::getSampleCount() const {
return sampleCount;
}
// Position relative to the start of the segment, derived from the inner stream's position.
int64_t AudioStreamSegment::getSampleIndex() const {
return audioStream->getSampleIndex() - sampleOffset;
}
// Seeks to a segment-relative index by seeking the inner stream to index + offset.
void AudioStreamSegment::seek(int64_t sampleIndex) {
audioStream->seek(sampleIndex + sampleOffset);
}
// Reads directly from the inner stream; no check against the segment bounds is made here.
float AudioStreamSegment::readSample() {
return audioStream->readSample();
}
// Convenience factory: wraps the given stream in an AudioStreamSegment for the given range.
std::unique_ptr<AudioStream> createSegment(std::unique_ptr<AudioStream> audioStream, const TimeRange& range) {
return std::make_unique<AudioStreamSegment>(std::move(audioStream), range);
}

View File

@ -1,21 +0,0 @@
#pragma once
#include <audio/AudioStream.h>
#include <TimeRange.h>
// Audio stream that exposes a time range of another stream.
class AudioStreamSegment : public AudioStream {
public:
// Throws std::invalid_argument if the range does not lie within the input stream.
AudioStreamSegment(std::unique_ptr<AudioStream> audioStream, const TimeRange& range);
// Copy with optional rewind to the start of the segment.
AudioStreamSegment(const AudioStreamSegment& rhs, bool reset);
std::unique_ptr<AudioStream> clone(bool reset) const override;
int getSampleRate() const override;
int64_t getSampleCount() const override;
int64_t getSampleIndex() const override;
void seek(int64_t sampleIndex) override;
float readSample() override;
private:
std::unique_ptr<AudioStream> audioStream;
// Position of the segment within the inner stream and its length, both in samples.
const int64_t sampleOffset, sampleCount;
};
std::unique_ptr<AudioStream> createSegment(std::unique_ptr<AudioStream> audioStream, const TimeRange& range);

View File

@ -1,73 +1,46 @@
#include "DCOffset.h"
#include <gsl_util.h>
#include <cmath>
DCOffset::DCOffset(std::unique_ptr<AudioStream> inputStream, float offset) :
inputStream(std::move(inputStream)),
using std::unique_ptr;
using std::make_unique;
DCOffset::DCOffset(unique_ptr<AudioClip> inputClip, float offset) :
inputClip(std::move(inputClip)),
offset(offset),
factor(1 / (1 + std::abs(offset)))
{}
DCOffset::DCOffset(const DCOffset& rhs, bool reset) :
inputStream(rhs.inputStream->clone(reset)),
offset(rhs.offset),
factor(rhs.factor)
{}
std::unique_ptr<AudioStream> DCOffset::clone(bool reset) const {
return std::make_unique<DCOffset>(*this, reset);
// Returns a copy of this clip, created through the copy constructor.
unique_ptr<AudioClip> DCOffset::clone() const {
    const DCOffset& self = *this;
    return make_unique<DCOffset>(self);
}
int DCOffset::getSampleRate() const {
return inputStream->getSampleRate();
}
int64_t DCOffset::getSampleCount() const {
return inputStream->getSampleCount();
}
int64_t DCOffset::getSampleIndex() const {
return inputStream->getSampleIndex();
}
void DCOffset::seek(int64_t sampleIndex) {
inputStream->seek(sampleIndex);
}
float DCOffset::readSample() {
float sample = inputStream->readSample();
// Returns a reader that scales each input sample by the amplitude factor
// and then adds the DC offset.
SampleReader DCOffset::createUnsafeSampleReader() const {
    return [readInput = inputClip->createSampleReader(), scale = factor, shift = offset](size_type index) {
        const float inputSample = readInput(index);
        return inputSample * scale + shift;
    };
}
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon) {
if (std::abs(offset) < epsilon) return audioStream;
return std::make_unique<DCOffset>(std::move(audioStream), offset);
}
float getDCOffset(AudioStream& audioStream) {
float getDCOffset(const AudioClip& audioClip) {
int flatMeanSampleCount, fadingMeanSampleCount;
int sampleRate = audioStream.getSampleRate();
if (audioStream.getSampleCount() > 4 * sampleRate) {
int sampleRate = audioClip.getSampleRate();
if (audioClip.size() > 4 * sampleRate) {
// Long audio file. Average over the first 3 seconds, then fade out over the 4th.
flatMeanSampleCount = 3 * sampleRate;
fadingMeanSampleCount = 1 * sampleRate;
} else {
// Short audio file. Average over the entire length.
flatMeanSampleCount = static_cast<int>(audioStream.getSampleCount());
flatMeanSampleCount = static_cast<int>(audioClip.size());
fadingMeanSampleCount = 0;
}
int64_t originalSampleIndex = audioStream.getSampleIndex();
audioStream.seek(0);
auto restorePosition = gsl::finally([&]() { audioStream.seek(originalSampleIndex); });
auto read = audioClip.createSampleReader();
double sum = 0;
for (int i = 0; i < flatMeanSampleCount; i++) {
sum += audioStream.readSample();
for (int i = 0; i < flatMeanSampleCount; ++i) {
sum += read(i);
}
for (int i = 0; i < fadingMeanSampleCount; i++) {
for (int i = 0; i < fadingMeanSampleCount; ++i) {
double weight = static_cast<double>(fadingMeanSampleCount - i) / fadingMeanSampleCount;
sum += audioStream.readSample() * weight;
sum += read(flatMeanSampleCount + i) * weight;
}
double totalWeight = flatMeanSampleCount + (fadingMeanSampleCount + 1) / 2.0;
@ -75,7 +48,16 @@ float getDCOffset(AudioStream& audioStream) {
return static_cast<float>(offset);
}
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> inputStream) {
float offset = getDCOffset(*inputStream.get());
return addDCOffset(std::move(inputStream), -offset);
// Creates an effect that adds a constant DC offset to a clip.
// If the magnitude of the offset is below epsilon, the input clip is passed through unchanged.
AudioEffect addDCOffset(float offset, float epsilon) {
    return [offset, epsilon](unique_ptr<AudioClip> inputClip) -> unique_ptr<AudioClip> {
        const bool offsetIsNegligible = std::abs(offset) < epsilon;
        if (!offsetIsNegligible) {
            return make_unique<DCOffset>(std::move(inputClip), offset);
        }
        return inputClip;
    };
}
// Creates an effect that measures the DC offset of its input clip
// and compensates for it by adding the negated offset.
AudioEffect removeDCOffset(float epsilon) {
    return [epsilon](unique_ptr<AudioClip> inputClip) {
        const float measuredOffset = getDCOffset(*inputClip);
        AudioEffect compensate = addDCOffset(-measuredOffset, epsilon);
        return std::move(inputClip) | compensate;
    };
}

View File

@ -1,26 +1,32 @@
#pragma once
#include "AudioStream.h"
#include "AudioClip.h"
// Applies a constant DC offset to an audio stream and reduces its amplitude
// Applies a constant DC offset to an audio clip and reduces its amplitude
// to prevent clipping
class DCOffset : public AudioStream {
class DCOffset : public AudioClip {
public:
DCOffset(std::unique_ptr<AudioStream> inputStream, float offset);
DCOffset(const DCOffset& rhs, bool reset);
std::unique_ptr<AudioStream> clone(bool reset) const override;
DCOffset(std::unique_ptr<AudioClip> inputClip, float offset);
std::unique_ptr<AudioClip> clone() const override;
int getSampleRate() const override;
int64_t getSampleCount() const override;
int64_t getSampleIndex() const override;
void seek(int64_t sampleIndex) override;
float readSample() override;
size_type size() const override;
private:
std::unique_ptr<AudioStream> inputStream;
SampleReader createUnsafeSampleReader() const override;
std::shared_ptr<AudioClip> inputClip;
float offset;
float factor;
};
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon = 1.0f / 15000);
inline int DCOffset::getSampleRate() const {
return inputClip->getSampleRate();
}
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> audioStream);
inline AudioClip::size_type DCOffset::size() const {
return inputClip->size();
}
float getDCOffset(const AudioClip& audioClip);
AudioEffect addDCOffset(float offset, float epsilon = 1.0f / 15000);
AudioEffect removeDCOffset(float epsilon = 1.0f / 15000);

View File

@ -1,105 +1,62 @@
#include <cmath>
#include "SampleRateConverter.h"
#include <stdexcept>
#include <algorithm>
#include <format.h>
using std::invalid_argument;
using std::unique_ptr;
using std::make_unique;
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate) :
inputStream(std::move(inputStream)),
downscalingFactor(static_cast<double>(this->inputStream->getSampleRate()) / outputSampleRate),
SampleRateConverter::SampleRateConverter(unique_ptr<AudioClip> inputClip, int outputSampleRate) :
inputClip(std::move(inputClip)),
downscalingFactor(static_cast<double>(this->inputClip->getSampleRate()) / outputSampleRate),
outputSampleRate(outputSampleRate),
outputSampleCount(std::lround(this->inputStream->getSampleCount() / downscalingFactor)),
lastInputSample(0),
lastInputSampleIndex(-1),
nextOutputSampleIndex(0)
outputSampleCount(std::lround(this->inputClip->size() / downscalingFactor))
{
if (outputSampleRate <= 0) {
throw invalid_argument("Sample rate must be positive.");
}
if (this->inputStream->getSampleRate() < outputSampleRate) {
throw invalid_argument(fmt::format("Upsampling not supported. Audio sample rate must not be below {}Hz.", outputSampleRate));
if (this->inputClip->getSampleRate() < outputSampleRate) {
throw invalid_argument(fmt::format("Upsampling not supported. Input sample rate must not be below {}Hz.", outputSampleRate));
}
}
SampleRateConverter::SampleRateConverter(const SampleRateConverter& rhs, bool reset) :
SampleRateConverter(rhs.inputStream->clone(reset), rhs.outputSampleRate)
{
nextOutputSampleIndex = reset ? 0 : rhs.nextOutputSampleIndex;
// Returns a copy of this converter, created through the copy constructor.
unique_ptr<AudioClip> SampleRateConverter::clone() const {
    const SampleRateConverter& self = *this;
    return make_unique<SampleRateConverter>(self);
}
std::unique_ptr<AudioStream> SampleRateConverter::clone(bool reset) const {
return std::make_unique<SampleRateConverter>(*this, reset);
}
int SampleRateConverter::getSampleRate() const {
return outputSampleRate;
}
int64_t SampleRateConverter::getSampleCount() const {
return outputSampleCount;
}
int64_t SampleRateConverter::getSampleIndex() const {
return nextOutputSampleIndex;
}
void SampleRateConverter::seek(int64_t sampleIndex) {
if (sampleIndex < 0 || sampleIndex >= outputSampleCount) throw std::invalid_argument("sampleIndex out of range.");
nextOutputSampleIndex = sampleIndex;
}
float SampleRateConverter::readSample() {
if (nextOutputSampleIndex >= outputSampleCount) throw std::out_of_range("End of stream.");
double inputStart = nextOutputSampleIndex * downscalingFactor;
double inputEnd = (nextOutputSampleIndex + 1) * downscalingFactor;
nextOutputSampleIndex++;
return mean(inputStart, inputEnd);
}
float SampleRateConverter::mean(double inputStart, double inputEnd) {
float mean(double inputStart, double inputEnd, const SampleReader& read) {
// Calculate weighted sum...
double sum = 0;
// ... first sample (weight <= 1)
int64_t startIndex = static_cast<int64_t>(inputStart);
sum += getInputSample(startIndex) * ((startIndex + 1) - inputStart);
sum += read(startIndex) * ((startIndex + 1) - inputStart);
// ... middle samples (weight 1 each)
int64_t endIndex = static_cast<int64_t>(inputEnd);
for (int64_t index = startIndex + 1; index < endIndex; ++index) {
sum += getInputSample(index);
sum += read(index);
}
// ... last sample (weight < 1)
sum += getInputSample(endIndex) * (inputEnd - endIndex);
if (endIndex < inputEnd) {
sum += read(endIndex) * (inputEnd - endIndex);
}
return static_cast<float>(sum / (inputEnd - inputStart));
}
float SampleRateConverter::getInputSample(int64_t sampleIndex) {
sampleIndex = std::min(sampleIndex, inputStream->getSampleCount() - 1);
if (sampleIndex < 0) return 0.0f;
if (sampleIndex == lastInputSampleIndex) {
return lastInputSample;
}
if (sampleIndex != inputStream->getSampleIndex()) {
inputStream->seek(sampleIndex);
}
lastInputSample = inputStream->readSample();
lastInputSampleIndex = sampleIndex;
return lastInputSample;
// Returns a reader that produces each output sample as the weighted mean of the
// input samples covering the corresponding span of the input clip.
SampleReader SampleRateConverter::createUnsafeSampleReader() const {
    // inputStart/inputEnd are positions in the INPUT clip, so the upper bound must be
    // the input clip's size. Clamping to this converter's own (smaller) output size
    // would cut inputEnd below inputStart for the later part of the clip.
    return [read = inputClip->createSampleReader(), downscalingFactor = downscalingFactor, inputSize = inputClip->size()](size_type index) {
        double inputStart = index * downscalingFactor;
        double inputEnd = std::min((index + 1) * downscalingFactor, static_cast<double>(inputSize));
        return mean(inputStart, inputEnd, read);
    };
}
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate) {
if (sampleRate == audioStream->getSampleRate()) {
return audioStream;
}
return std::make_unique<SampleRateConverter>(std::move(audioStream), sampleRate);
// Creates an effect that wraps its input clip in a SampleRateConverter
// with the given output sample rate.
AudioEffect resample(int sampleRate) {
    const int targetSampleRate = sampleRate;
    return [targetSampleRate](unique_ptr<AudioClip> clip) {
        return make_unique<SampleRateConverter>(std::move(clip), targetSampleRate);
    };
}

View File

@ -1,32 +1,29 @@
#pragma once
#include <memory>
#include "AudioStream.h"
#include "AudioClip.h"
class SampleRateConverter : public AudioStream {
class SampleRateConverter : public AudioClip {
public:
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate);
SampleRateConverter(const SampleRateConverter& rhs, bool reset);
std::unique_ptr<AudioStream> clone(bool reset) const override;
SampleRateConverter(std::unique_ptr<AudioClip> inputClip, int outputSampleRate);
std::unique_ptr<AudioClip> clone() const override;
int getSampleRate() const override;
int64_t getSampleCount() const override;
int64_t getSampleIndex() const override;
void seek(int64_t sampleIndex) override;
float readSample() override;
size_type size() const override;
private:
std::unique_ptr<AudioStream> inputStream;
double downscalingFactor; // input sample rate / output sample rate
SampleReader createUnsafeSampleReader() const override;
std::shared_ptr<AudioClip> inputClip;
double downscalingFactor; // input sample rate / output sample rate
int outputSampleRate;
int64_t outputSampleCount;
float lastInputSample;
int64_t lastInputSampleIndex;
int64_t nextOutputSampleIndex;
float mean(double start, double end);
float getInputSample(int64_t sampleIndex);
};
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate);
AudioEffect resample(int sampleRate);
// Returns the output sample rate passed to the constructor.
inline int SampleRateConverter::getSampleRate() const {
return outputSampleRate;
}
// Returns the number of output samples, as computed in the constructor.
inline AudioClip::size_type SampleRateConverter::size() const {
return outputSampleCount;
}

View File

@ -1,59 +0,0 @@
#include "UnboundedStream.h"
using boost::optional;
// Wraps the given stream, starting at the stream's current position.
// For an empty input stream, the first and last sample are fixed to 0;
// otherwise they are left unset and read on demand.
UnboundedStream::UnboundedStream(std::unique_ptr<AudioStream> inputStream) :
    innerStream(std::move(inputStream)),
    sampleIndex(innerStream->getSampleIndex()),
    // inputStream has been moved from at this point and is null;
    // all further access must go through innerStream.
    firstSample(innerStream->getSampleCount() ? optional<float>() : 0.0f),
    lastSample(innerStream->getSampleCount() ? optional<float>() : 0.0f)
{}
// Copies the wrapper. The inner stream is cloned with the given reset flag;
// the wrapper's own sample index and cached edge samples are copied as they are.
UnboundedStream::UnboundedStream(const UnboundedStream& rhs, bool reset) :
innerStream(rhs.innerStream->clone(reset)),
sampleIndex(rhs.sampleIndex),
firstSample(rhs.firstSample),
lastSample(rhs.lastSample)
{}
std::unique_ptr<AudioStream> UnboundedStream::clone(bool reset) const {
return std::make_unique<UnboundedStream>(*this, reset);
}
int UnboundedStream::getSampleRate() const {
return innerStream->getSampleRate();
}
int64_t UnboundedStream::getSampleCount() const {
return innerStream->getSampleCount();
}
// Returns the wrapper's own position, which may lie outside the inner stream's range.
int64_t UnboundedStream::getSampleIndex() const {
return sampleIndex;
}
// Only records the position; any index is accepted and the inner stream is not touched.
void UnboundedStream::seek(int64_t sampleIndex) {
this->sampleIndex = sampleIndex;
}
float UnboundedStream::readSample() {
if (sampleIndex < 0) {
if (!firstSample) {
innerStream->seek(0);
firstSample = innerStream->readSample();
}
return firstSample.get();
}
if (sampleIndex >= innerStream->getSampleCount()) {
if (!lastSample) {
innerStream->seek(innerStream->getSampleCount() - 1);
lastSample = innerStream->readSample();
}
return lastSample.get();
}
if (sampleIndex != innerStream->getSampleIndex()) {
innerStream->seek(sampleIndex);
}
return innerStream->readSample();
}

View File

@ -1,22 +0,0 @@
#pragma once
#include "AudioStream.h"
#include <boost/optional/optional.hpp>
// Stream wrapper that allows reading before the start and past the end of the input stream.
// Reads before the start return the first sample, reads past the end return the last sample.
class UnboundedStream : public AudioStream {
public:
UnboundedStream(std::unique_ptr<AudioStream> inputStream);
UnboundedStream(const UnboundedStream& rhs, bool reset);
std::unique_ptr<AudioStream> clone(bool reset) const override;
int getSampleRate() const override;
int64_t getSampleCount() const override;
int64_t getSampleIndex() const override;
// Accepts any index, including indices outside the input stream's range.
void seek(int64_t sampleIndex) override;
float readSample() override;
private:
std::unique_ptr<AudioStream> innerStream;
// Position of this wrapper; tracked separately from the inner stream's position.
int64_t sampleIndex;
// Cached edge samples; empty until first needed (or fixed to 0 for an empty input stream).
boost::optional<float> firstSample, lastSample;
};

View File

@ -7,6 +7,10 @@ using std::runtime_error;
using fmt::format;
using std::string;
using namespace little_endian;
using std::unique_ptr;
using std::make_unique;
using std::make_shared;
using boost::filesystem::path;
#define INT24_MIN (-8388608)
#define INT24_MAX 8388607
@ -25,12 +29,34 @@ enum class Codec {
Float = 0x03
};
WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
// Opens the given file for binary reading and returns the stream, positioned at the start.
// The returned stream has exceptions enabled for failbit and badbit.
// Throws std::runtime_error carrying the system error text if the file cannot be opened or read.
std::ifstream openFile(path filePath) {
    try {
        std::ifstream file;
        file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
        file.open(filePath.c_str(), std::ios::binary);

        // Error messages on stream exceptions are mostly useless.
        // Read some dummy data so that we can throw a decent exception in case the file is missing, locked, etc.
        file.seekg(0, std::ios_base::end);
        if (file.tellg()) {
            file.seekg(0);
            file.get();
            file.seekg(0);
        }

        // Returning the local by name moves it implicitly (std::ifstream is move-only)
        // and keeps copy elision possible; an explicit std::move would prevent elision.
        return file;
    } catch (const std::ifstream::failure&) {
        char message[256];
        strerror_s(message, sizeof message, errno);
        throw runtime_error(message);
    }
}
WaveFileReader::WaveFileReader(path filePath) :
filePath(filePath),
file(),
frameIndex(0)
formatInfo{}
{
openFile();
auto file = openFile(filePath);
file.seekg(0, std::ios_base::end);
std::streamoff fileSize = file.tellg();
@ -57,16 +83,15 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
// Read chunks until we reach the data chunk
bool reachedDataChunk = false;
bytesPerSample = 0;
while (!reachedDataChunk && remaining(8)) {
uint32_t chunkId = read<uint32_t>(file);
int chunkSize = read<uint32_t>(file);
switch (chunkId) {
case fourcc('f', 'm', 't', ' '): {
// Read relevant data
Codec codec = (Codec)read<uint16_t>(file);
channelCount = read<uint16_t>(file);
frameRate = read<uint32_t>(file);
Codec codec = static_cast<Codec>(read<uint16_t>(file));
formatInfo.channelCount = read<uint16_t>(file);
formatInfo.frameRate = read<uint32_t>(file);
read<uint32_t>(file); // Bytes per second
int frameSize = read<uint16_t>(file);
int bitsPerSample = read<uint16_t>(file);
@ -75,31 +100,32 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
file.seekg(roundToEven(chunkSize) - 16, file.cur);
// Determine sample format
int bytesPerSample;
switch (codec) {
case Codec::PCM:
// Determine sample size.
// According to the WAVE standard, sample sizes that are not multiples of 8 bits
// (e.g. 12 bits) can be treated like the next-larger byte size.
if (bitsPerSample == 8) {
sampleFormat = SampleFormat::UInt8;
formatInfo.sampleFormat = SampleFormat::UInt8;
bytesPerSample = 1;
} else if (bitsPerSample <= 16) {
sampleFormat = SampleFormat::Int16;
formatInfo.sampleFormat = SampleFormat::Int16;
bytesPerSample = 2;
} else if (bitsPerSample <= 24) {
sampleFormat = SampleFormat::Int24;
formatInfo.sampleFormat = SampleFormat::Int24;
bytesPerSample = 3;
} else {
throw runtime_error(
format("Unsupported sample format: {}-bit integer samples.", bitsPerSample));
}
if (bytesPerSample != frameSize / channelCount) {
if (bytesPerSample != frameSize / formatInfo.channelCount) {
throw runtime_error("Unsupported sample organization.");
}
break;
case Codec::Float:
if (bitsPerSample == 32) {
sampleFormat = SampleFormat::Float32;
formatInfo.sampleFormat = SampleFormat::Float32;
bytesPerSample = 4;
} else {
throw runtime_error(format("Unsupported sample format: {}-bit floating-point samples.", bitsPerSample));
@ -108,13 +134,13 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
default:
throw runtime_error("Unsupported sample format. Only uncompressed formats are supported.");
}
formatInfo.bytesPerFrame = bytesPerSample * formatInfo.channelCount;
break;
}
case fourcc('d', 'a', 't', 'a'): {
reachedDataChunk = true;
dataOffset = file.tellg();
int sampleCount = chunkSize / bytesPerSample;
frameCount = sampleCount / channelCount;
formatInfo.dataOffset = file.tellg();
formatInfo.frameCount = chunkSize / formatInfo.bytesPerFrame;
break;
}
default: {
@ -124,75 +150,13 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
}
}
}
if (!reachedDataChunk) {
dataOffset = file.tellg();
frameCount = 0;
}
}
WaveFileReader::WaveFileReader(const WaveFileReader& rhs, bool reset) :
filePath(rhs.filePath),
file(),
bytesPerSample(rhs.bytesPerSample),
sampleFormat(rhs.sampleFormat),
frameRate(rhs.frameRate),
frameCount(rhs.frameCount),
channelCount(rhs.channelCount),
dataOffset(rhs.dataOffset),
frameIndex(-1)
{
openFile();
seek(reset ? 0 : rhs.frameIndex);
// Returns a copy of this reader, created through the copy constructor.
unique_ptr<AudioClip> WaveFileReader::clone() const {
    const WaveFileReader& self = *this;
    return make_unique<WaveFileReader>(self);
}
std::unique_ptr<AudioStream> WaveFileReader::clone(bool reset) const {
return std::make_unique<WaveFileReader>(*this, reset);
}
void WaveFileReader::openFile() {
try {
file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
file.open(filePath, std::ios::binary);
// Error messages on stream exceptions are mostly useless.
// Read some dummy data so that we can throw a decent exception in case the file is missing, locked, etc.
file.seekg(0, std::ios_base::end);
if (file.tellg()) {
file.seekg(0);
file.get();
file.seekg(0);
}
} catch (const std::ifstream::failure&) {
char message[256];
strerror_s(message, sizeof message, errno);
throw runtime_error(message);
}
}
int WaveFileReader::getSampleRate() const {
return frameRate;
}
int64_t WaveFileReader::getSampleCount() const {
return frameCount;
}
int64_t WaveFileReader::getSampleIndex() const {
return frameIndex;
}
void WaveFileReader::seek(int64_t frameIndex) {
if (frameIndex < 0 || frameIndex > frameCount) throw std::invalid_argument("frameIndex out of range.");
file.seekg(dataOffset + static_cast<std::streamoff>(frameIndex * channelCount * bytesPerSample));
this->frameIndex = frameIndex;
}
float WaveFileReader::readSample() {
if (frameIndex >= frameCount) throw std::out_of_range("End of stream.");
++frameIndex;
inline AudioClip::value_type readSample(std::ifstream& file, SampleFormat sampleFormat, int channelCount) {
float sum = 0;
for (int channelIndex = 0; channelIndex < channelCount; channelIndex++) {
switch (sampleFormat) {
@ -221,3 +185,13 @@ float WaveFileReader::readSample() {
return sum / channelCount;
}
// Returns a reader that owns its own file handle and reads one frame per call.
//
// The file stream is held through a shared_ptr because std::function requires a
// copyable callable and std::ifstream is move-only. The reader remembers where the
// previous read ended so that sequential reads do not need to seek again.
SampleReader WaveFileReader::createUnsafeSampleReader() const {
    return [formatInfo = formatInfo, file = std::make_shared<std::ifstream>(openFile(filePath)), filePos = std::streampos(0)](size_type index) mutable {
        const std::streampos newFilePos = formatInfo.dataOffset + static_cast<std::streamoff>(index * formatInfo.bytesPerFrame);
        if (newFilePos != filePos) {
            file->seekg(newFilePos);
        }

        // Mark the position as unknown while reading. If the read throws half-way,
        // the next call is forced to seek instead of trusting a stale position.
        filePos = std::streampos(-1);
        value_type result = readSample(*file, formatInfo.sampleFormat, formatInfo.channelCount);

        // NOTE(review): assumes readSample consumes exactly bytesPerFrame bytes - confirm.
        filePos = newFilePos + static_cast<std::streamoff>(formatInfo.bytesPerFrame);
        return result;
    };
}

View File

@ -1,8 +1,7 @@
#pragma once
#include <boost/filesystem/path.hpp>
#include <boost/filesystem/fstream.hpp>
#include "AudioStream.h"
#include "AudioClip.h"
enum class SampleFormat {
UInt8,
@ -11,28 +10,33 @@ enum class SampleFormat {
Float32
};
// Audio source backed by a wave file on disk.
// Two class heads and two member lists are interleaved in this listing: an AudioStream-based
// interface (clone(bool), getSampleCount, getSampleIndex, seek, readSample) and an
// AudioClip-based one (clone(), size, createUnsafeSampleReader).
class WaveFileReader : public AudioStream {
class WaveFileReader : public AudioClip {
public:
// Creates a reader for the wave file at the given path.
WaveFileReader(boost::filesystem::path filePath);
WaveFileReader(const WaveFileReader& rhs, bool reset);
std::unique_ptr<AudioStream> clone(bool reset) const override;
int getSampleRate() const override ;
int64_t getSampleCount() const override;
int64_t getSampleIndex() const override;
// Throws std::invalid_argument if sampleIndex lies outside [0, sample count].
void seek(int64_t sampleIndex) override;
// Throws std::out_of_range once the end of the stream has been reached.
float readSample() override;
std::unique_ptr<AudioClip> clone() const override;
int getSampleRate() const override;
size_type size() const override;
private:
void openFile();
// Returns a reader that maps a frame index to a sample without range-checking the index.
SampleReader createUnsafeSampleReader() const override;
private:
boost::filesystem::path filePath;
boost::filesystem::ifstream file;
int bytesPerSample;
// Format parameters of the file. createUnsafeSampleReader captures a copy of this struct.
struct WaveFormatInfo {
int bytesPerFrame;
SampleFormat sampleFormat;
int frameRate;
int64_t frameCount;
int channelCount;
// Added to a frame's byte offset when seeking to that frame
std::streampos dataOffset;
int64_t frameIndex;
};
boost::filesystem::path filePath;
WaveFormatInfo formatInfo;
};
// Sample rate of the clip: the frame rate stored in the file's format info.
inline int WaveFileReader::getSampleRate() const {
	const int sampleRate = formatInfo.frameRate;
	return sampleRate;
}
// Number of samples in the clip: the frame count stored in the file's format info.
inline AudioClip::size_type WaveFileReader::size() const {
	const auto frameCount = formatInfo.frameCount;
	return frameCount;
}

View File

@ -10,31 +10,31 @@ inline int16_t floatSampleToInt16(float sample) {
return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
}
// Converts the audio to 16-bit samples and hands it to processBuffer in chunks of at most
// bufferCapacity samples. After each chunk, the fraction of samples processed so far is
// reported to progressSink.
// This listing carries two signatures for one body (the AudioStream& form and the
// const AudioClip& form); the statements below likewise mix stream-based and iterator-based reads.
void process16bitAudioStream(AudioStream& audioStream, function<void(const vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink) {
void process16bitAudioClip(const AudioClip& audioClip, function<void(const vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink) {
// Process entire sound stream
vector<int16_t> buffer;
buffer.reserve(bufferCapacity);
// NOTE(review): sampleCount is an int but accumulates size_t buffer sizes below -- confirm it cannot overflow for long recordings.
int sampleCount = 0;
auto it = audioClip.begin();
auto end = audioClip.end();
do {
// Read to buffer
buffer.clear();
while (buffer.size() < bufferCapacity && !audioStream.endOfStream()) {
// Read sample
float floatSample = audioStream.readSample();
int16_t sample = floatSampleToInt16(floatSample);
buffer.push_back(sample);
for (; buffer.size() < bufferCapacity && it != end; ++it) {
// Read sample to buffer
buffer.push_back(floatSampleToInt16(*it));
}
// Process buffer
// The loop only ends after a pass that read nothing, so the final call receives an empty buffer.
processBuffer(buffer);
sampleCount += buffer.size();
// NOTE(review): for empty audio the divisor is zero -- presumably yielding NaN; confirm reportProgress tolerates that.
progressSink.reportProgress(static_cast<double>(sampleCount) / audioStream.getSampleCount());
progressSink.reportProgress(static_cast<double>(sampleCount) / audioClip.size());
} while (buffer.size());
}
// Convenience overload: forwards to the four-argument overload with a fixed buffer capacity of 1600 samples.
// Both the AudioStream& and the const AudioClip& signature and forwarding call appear in this listing.
void process16bitAudioStream(AudioStream& audioStream, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
void process16bitAudioClip(const AudioClip& audioClip, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
const size_t capacity = 1600; // 0.1 second capacity (1600 samples is 0.1 s at 16 kHz -- presumably the expected rate; confirm against callers)
process16bitAudioStream(audioStream, processBuffer, capacity, progressSink);
process16bitAudioClip(audioClip, processBuffer, capacity, progressSink);
}

View File

@ -2,8 +2,8 @@
#include <vector>
#include <functional>
#include "audio/AudioStream.h"
#include "audio/AudioClip.h"
#include "ProgressBar.h"
void process16bitAudioStream(AudioStream& audioStream, std::function<void(const std::vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink);
void process16bitAudioStream(AudioStream& audioStream, std::function<void(const std::vector<int16_t>&)> processBuffer, ProgressSink& progressSink);
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink);
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, ProgressSink& progressSink);

View File

@ -8,7 +8,7 @@
#include "processing.h"
#include <gsl_util.h>
#include <parallel.h>
#include "AudioStreamSegment.h"
#include "AudioSegment.h"
using std::vector;
using boost::adaptors::transformed;
@ -16,7 +16,7 @@ using fmt::format;
using std::runtime_error;
using std::unique_ptr;
BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, ProgressSink& progressSink) {
BoundedTimeline<void> webRtcDetectVoiceActivity(const AudioClip& audioClip, ProgressSink& progressSink) {
VadInst* vadHandle = WebRtcVad_Create();
if (!vadHandle) throw runtime_error("Error creating WebRTC VAD handle.");
@ -30,14 +30,14 @@ BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, Progre
if (error) throw runtime_error("Error setting WebRTC VAD aggressiveness.");
// Detect activity
BoundedTimeline<void> activity(audioStream.getTruncatedRange());
BoundedTimeline<void> activity(audioClip.getTruncatedRange());
centiseconds time = 0cs;
const size_t bufferCapacity = audioStream.getSampleRate() / 100;
const size_t bufferCapacity = audioClip.getSampleRate() / 100;
auto processBuffer = [&](const vector<int16_t>& buffer) {
// WebRTC is picky regarding buffer size
if (buffer.size() < bufferCapacity) return;
int result = WebRtcVad_Process(vadHandle, audioStream.getSampleRate(), buffer.data(), buffer.size()) == 1;
int result = WebRtcVad_Process(vadHandle, audioClip.getSampleRate(), buffer.data(), buffer.size()) == 1;
if (result == -1) throw runtime_error("Error processing audio buffer using WebRTC VAD.");
bool isActive = result != 0;
@ -46,7 +46,7 @@ BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, Progre
}
time += 1cs;
};
process16bitAudioStream(*audioStream.clone(true), processBuffer, bufferCapacity, progressSink);
process16bitAudioClip(audioClip, processBuffer, bufferCapacity, progressSink);
// WebRTC adapts to the audio. This means results may not be correct at the very beginning.
// It sometimes returns false activity at the very beginning, mistaking the background noise for speech.
@ -54,31 +54,31 @@ BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, Progre
if (!activity.empty()) {
TimeRange firstActivity = activity.begin()->getTimeRange();
activity.clear(firstActivity);
unique_ptr<AudioStream> streamStart = createSegment(audioStream.clone(true), TimeRange(0cs, firstActivity.getEnd()));
unique_ptr<AudioClip> streamStart = audioClip.clone() | segment(TimeRange(0cs, firstActivity.getEnd()));
time = 0cs;
process16bitAudioStream(*streamStart, processBuffer, bufferCapacity, progressSink);
process16bitAudioClip(*streamStart, processBuffer, bufferCapacity, progressSink);
}
return activity;
}
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink) {
BoundedTimeline<void> detectVoiceActivity(const AudioClip& inputAudioClip, ProgressSink& progressSink) {
// Prepare audio for VAD
audioStream = removeDCOffset(convertSampleRate(std::move(audioStream), 16000));
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(16000) | removeDCOffset();
BoundedTimeline<void> activity(audioStream->getTruncatedRange());
BoundedTimeline<void> activity(audioClip->getTruncatedRange());
std::mutex activityMutex;
// Split audio into segments and perform parallel VAD
int segmentCount = getProcessorCoreCount();
centiseconds audioLength = audioStream->getTruncatedRange().getLength();
centiseconds audioLength = audioClip->getTruncatedRange().getLength();
vector<TimeRange> audioSegments;
for (int i = 0; i < segmentCount; ++i) {
TimeRange segmentRange = TimeRange(i * audioLength / segmentCount, (i + 1) * audioLength / segmentCount);
audioSegments.push_back(segmentRange);
}
runParallel([&](const TimeRange& segmentRange, ProgressSink& segmentProgressSink) {
unique_ptr<AudioStream> audioSegment = createSegment(audioStream->clone(false), segmentRange);
unique_ptr<AudioClip> audioSegment = audioClip->clone() | segment(segmentRange);
BoundedTimeline<void> activitySegment = webRtcDetectVoiceActivity(*audioSegment, segmentProgressSink);
std::lock_guard<std::mutex> lock(activityMutex);

View File

@ -1,7 +1,6 @@
#pragma once
#include <memory>
#include "AudioStream.h"
#include "AudioClip.h"
#include <BoundedTimeline.h>
#include <ProgressBar.h>
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink);
BoundedTimeline<void> detectVoiceActivity(const AudioClip& audioClip, ProgressSink& progressSink);

View File

@ -4,7 +4,7 @@
using namespace little_endian;
void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileName) {
void createWaveFile(const AudioClip& audioClip, std::string fileName) {
// Open file
std::ofstream file;
file.exceptions(std::ofstream::failbit | std::ofstream::badbit);
@ -15,7 +15,7 @@ void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileNa
uint32_t formatChunkSize = 16;
uint16_t channelCount = 1;
uint16_t frameSize = static_cast<uint16_t>(channelCount * sizeof(float));
uint32_t dataChunkSize = static_cast<uint32_t>(inputStream->getSampleCount() * frameSize);
uint32_t dataChunkSize = static_cast<uint32_t>(audioClip.size() * frameSize);
uint32_t riffChunkSize = 4 + (8 + formatChunkSize) + (8 + dataChunkSize);
write<uint32_t>(riffChunkSize, file);
write<uint32_t>(fourcc('W', 'A', 'V', 'E'), file);
@ -26,7 +26,7 @@ void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileNa
uint16_t codec = 0x03; // 32-bit float
write<uint16_t>(codec, file);
write<uint16_t>(channelCount, file);
uint32_t frameRate = static_cast<uint16_t>(inputStream->getSampleRate());
uint32_t frameRate = static_cast<uint16_t>(audioClip.getSampleRate());
write<uint32_t>(frameRate, file);
uint32_t bytesPerSecond = frameRate * frameSize;
write<uint32_t>(bytesPerSecond, file);
@ -37,8 +37,7 @@ void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileNa
// Write data chunk
write<uint32_t>(fourcc('d', 'a', 't', 'a'), file);
write<uint32_t>(dataChunkSize, file);
while (!inputStream->endOfStream()) {
float sample = inputStream->readSample();
for (float sample : audioClip) {
write<float>(sample, file);
}
}

View File

@ -1,7 +1,5 @@
#pragma once
#include <memory>
#include <string>
#include "AudioStream.h"
#include "AudioClip.h"
void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileName);
void createWaveFile(const AudioClip& audioClip, std::string fileName);

View File

@ -15,6 +15,7 @@
#include <boost/filesystem/operations.hpp>
#include "stringTools.h"
#include <boost/range/adaptor/transformed.hpp>
#include <boost/filesystem/fstream.hpp>
using std::exception;
using std::string;
@ -43,7 +44,7 @@ string getMessage(const exception& e) {
return result;
}
unique_ptr<AudioStream> createAudioStream(path filePath) {
unique_ptr<AudioClip> createAudioClip(path filePath) {
try {
return std::make_unique<WaveFileReader>(filePath);
} catch (...) {
@ -144,7 +145,7 @@ int main(int argc, char *argv[]) {
{
ProgressBar progressBar;
phones = detectPhones(
createAudioStream(inputFileName.getValue()),
*createAudioClip(inputFileName.getValue()),
dialogFile.isSet() ? readTextFile(path(dialogFile.getValue())) : boost::optional<u32string>(),
progressBar);
}

View File

@ -11,7 +11,7 @@
#include <audio/DCOffset.h>
#include <Timeline.h>
#include <audio/voiceActivityDetection.h>
#include <audio/AudioStreamSegment.h>
#include "audio/AudioSegment.h"
#include "languageModels.h"
#include "tokenization.h"
#include "g2p.h"
@ -95,9 +95,9 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
logging::log(logLevel, message);
}
BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
// Convert audio stream to the exact format PocketSphinx requires
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);
// Restart timing at 0
ps_start_stream(&decoder);
@ -111,7 +111,7 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
};
process16bitAudioStream(*audioStream.get(), processBuffer, progressSink);
process16bitAudioClip(*audioClip, processBuffer, progressSink);
// End recognition
error = ps_end_utt(&decoder);
@ -121,7 +121,7 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
// As a result, the following utterance will be garbage.
// As a workaround, we throw away the decoder in this case.
// See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529
BoundedTimeline<string> result(audioStream->getTruncatedRange());
BoundedTimeline<string> result(audioClip->getTruncatedRange());
bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
if (noWordsRecognized) {
decoderIsStillUsable = false;
@ -147,7 +147,7 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
optional<Timeline<Phone>> getPhoneAlignment(
const vector<s3wid_t>& wordIds,
unique_ptr<AudioStream> audioStream,
const AudioClip& inputAudioClip,
ps_decoder_t& decoder,
ProgressSink& progressSink)
{
@ -164,7 +164,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
if (error) throw runtime_error("Error populating alignment struct.");
// Convert audio stream to the exact format PocketSphinx requires
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);
// Create search structure
acmod_t* acousticModel = decoder.acmod;
@ -195,7 +195,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
}
}
};
process16bitAudioStream(*audioStream.get(), processBuffer, progressSink);
process16bitAudioClip(*audioClip, processBuffer, progressSink);
// End search
error = ps_search_finish(search.get());
@ -288,7 +288,7 @@ lambda_unique_ptr<ps_decoder_t> createDecoder(optional<u32string> dialog) {
}
Timeline<Phone> utteranceToPhones(
AudioStream& audioStream,
const AudioClip& audioClip,
TimeRange utterance,
ps_decoder_t& decoder,
bool& decoderIsStillUsable,
@ -298,10 +298,10 @@ Timeline<Phone> utteranceToPhones(
ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
auto streamSegment = createSegment(audioStream.clone(true), utterance);
const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance);
// Get words
BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), decoder, decoderIsStillUsable, wordRecognitionProgressSink);
BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable, wordRecognitionProgressSink);
for (Timed<string> timedWord : words) {
timedWord.getTimeRange().shift(utterance.getStart());
logging::logTimedEvent("word", timedWord);
@ -315,8 +315,8 @@ Timeline<Phone> utteranceToPhones(
if (wordIds.empty()) return Timeline<Phone>();
// Align the words' phones with speech
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), decoder, alignmentProgressSink)
.value_or(ContinuousTimeline<Phone>(streamSegment->getTruncatedRange(), Phone::Unknown));
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Unknown));
segmentPhones.shift(utterance.getStart());
for (const auto& timedPhone : segmentPhones) {
logging::logTimedEvent("phone", timedPhone);
@ -326,7 +326,7 @@ Timeline<Phone> utteranceToPhones(
}
BoundedTimeline<Phone> detectPhones(
unique_ptr<AudioStream> audioStream,
const AudioClip& inputAudioClip,
optional<u32string> dialog,
ProgressSink& progressSink)
{
@ -335,12 +335,12 @@ BoundedTimeline<Phone> detectPhones(
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
// Make sure audio stream has no DC offset
audioStream = removeDCOffset(std::move(audioStream));
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDCOffset();
// Split audio into utterances
BoundedTimeline<void> utterances;
try {
utterances = detectVoiceActivity(audioStream->clone(true), voiceActivationProgressSink);
utterances = detectVoiceActivity(*audioClip, voiceActivationProgressSink);
}
catch (...) {
std::throw_with_nested(runtime_error("Error detecting segments of speech."));
@ -369,17 +369,16 @@ BoundedTimeline<Phone> detectPhones(
decoderPool.push(std::move(decoder));
};
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
BoundedTimeline<Phone> result(audioClip->getTruncatedRange());
std::mutex resultMutex;
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
// Detect phones for utterance
auto decoder = getDecoder();
auto audioStreamCopy = audioStream->clone(true);
bool decoderIsStillUsable = true;
Timeline<Phone> phones =
utteranceToPhones(*audioStreamCopy, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink);
utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink);
if (decoderIsStillUsable) {
returnDecoder(std::move(decoder));
}
@ -404,7 +403,7 @@ BoundedTimeline<Phone> detectPhones(
// Don't use more threads than there are utterances to be processed
static_cast<int>(utterances.size()),
// Don't waste time creating additional threads (and decoders!) if the recording is short
static_cast<int>(duration_cast<std::chrono::seconds>(audioStream->getTruncatedRange().getLength()).count() / 10)
static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getLength()).count() / 10)
});
logging::debug("Speech recognition -- start");
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);

View File

@ -1,12 +1,11 @@
#pragma once
#include <memory>
#include "audio/AudioStream.h"
#include "audio/AudioClip.h"
#include "Phone.h"
#include "progressBar.h"
#include "BoundedTimeline.h"
BoundedTimeline<Phone> detectPhones(
std::unique_ptr<AudioStream> audioStream,
const AudioClip& audioClip,
boost::optional<std::u32string> dialog,
ProgressSink& progressSink);