Refactored audio handling
Now audio clips can be passed around as const references and don't carry state any more.
This commit is contained in:
parent
799f334fa7
commit
26cae93478
|
@ -197,11 +197,10 @@ set(SOURCE_FILES
|
||||||
src/phoneExtraction.cpp src/phoneExtraction.h
|
src/phoneExtraction.cpp src/phoneExtraction.h
|
||||||
src/platformTools.cpp src/platformTools.h
|
src/platformTools.cpp src/platformTools.h
|
||||||
src/tools.cpp src/tools.h
|
src/tools.cpp src/tools.h
|
||||||
src/audio/AudioStream.cpp src/audio/AudioStream.h
|
src/audio/AudioClip.cpp src/audio/AudioClip.h
|
||||||
src/audio/AudioStreamSegment.cpp src/audio/AudioStreamSegment.h
|
src/audio/AudioSegment.cpp src/audio/AudioSegment.h
|
||||||
src/audio/DCOffset.cpp src/audio/DCOffset.h
|
src/audio/DCOffset.cpp src/audio/DCOffset.h
|
||||||
src/audio/SampleRateConverter.cpp src/audio/SampleRateConverter.h
|
src/audio/SampleRateConverter.cpp src/audio/SampleRateConverter.h
|
||||||
src/audio/UnboundedStream.cpp src/audio/UnboundedStream.h
|
|
||||||
src/audio/voiceActivityDetection.cpp src/audio/voiceActivityDetection.h
|
src/audio/voiceActivityDetection.cpp src/audio/voiceActivityDetection.h
|
||||||
src/audio/WaveFileReader.cpp src/audio/WaveFileReader.h
|
src/audio/WaveFileReader.cpp src/audio/WaveFileReader.h
|
||||||
src/audio/waveFileWriting.cpp src/audio/waveFileWriting.h
|
src/audio/waveFileWriting.cpp src/audio/waveFileWriting.h
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
#include "AudioClip.h"
|
||||||
|
#include <format.h>
|
||||||
|
|
||||||
|
using std::invalid_argument;
|
||||||
|
|
||||||
|
// Returns the time range covered by this clip, starting at 0cs.
// The length is converted to whole centiseconds using integer division,
// so any fractional centisecond at the end is truncated.
TimeRange AudioClip::getTruncatedRange() const {
	const auto lengthInCentiseconds = 100 * size() / getSampleRate();
	return TimeRange(0cs, centiseconds(lengthInCentiseconds));
}
|
||||||
|
|
||||||
|
class SafeSampleReader {
|
||||||
|
public:
|
||||||
|
SafeSampleReader(SampleReader unsafeRead, AudioClip::size_type size);
|
||||||
|
AudioClip::value_type operator()(AudioClip::size_type index);
|
||||||
|
private:
|
||||||
|
SampleReader unsafeRead;
|
||||||
|
AudioClip::size_type size;
|
||||||
|
AudioClip::size_type lastIndex = -1;
|
||||||
|
AudioClip::value_type lastSample = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
SafeSampleReader::SafeSampleReader(SampleReader unsafeRead, AudioClip::size_type size) :
|
||||||
|
unsafeRead(unsafeRead),
|
||||||
|
size(size)
|
||||||
|
{}
|
||||||
|
|
||||||
|
inline AudioClip::value_type SafeSampleReader::operator()(AudioClip::size_type index) {
|
||||||
|
if (index < 0) {
|
||||||
|
throw invalid_argument(fmt::format("Cannot read from sample index {}. Index < 0.", index));
|
||||||
|
}
|
||||||
|
if (index >= size) {
|
||||||
|
throw invalid_argument(fmt::format("Cannot read from sample index {}. Clip size is {}.", index, size));
|
||||||
|
}
|
||||||
|
if (index == lastIndex) {
|
||||||
|
return lastSample;
|
||||||
|
}
|
||||||
|
|
||||||
|
lastIndex = index;
|
||||||
|
lastSample = unsafeRead(index);
|
||||||
|
return lastSample;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates a reader for this clip's samples that validates every index
// against the clip size before delegating to the unchecked reader
// supplied by the concrete clip class.
SampleReader AudioClip::createSampleReader() const {
	const size_type sampleCount = size();
	return SafeSampleReader(createUnsafeSampleReader(), sampleCount);
}
|
||||||
|
|
||||||
|
// Returns an iterator positioned at the first sample (index 0).
AudioClip::iterator AudioClip::begin() const {
	const size_type firstIndex = 0;
	return iterator(*this, firstIndex);
}
|
||||||
|
|
||||||
|
// Returns an iterator positioned one past the last sample (index size()).
AudioClip::iterator AudioClip::end() const {
	const size_type pastTheEndIndex = size();
	return iterator(*this, pastTheEndIndex);
}
|
||||||
|
|
||||||
|
// Pipe-style application of an effect: hands ownership of the clip to the
// effect and returns whatever clip the effect produces.
std::unique_ptr<AudioClip> operator|(std::unique_ptr<AudioClip> clip, AudioEffect effect) {
	std::unique_ptr<AudioClip> processedClip = effect(std::move(clip));
	return processedClip;
}
|
||||||
|
|
||||||
|
SampleIterator::SampleIterator() :
|
||||||
|
sampleIndex(0)
|
||||||
|
{}
|
||||||
|
|
||||||
|
// Creates an iterator over the given clip, positioned at sampleIndex.
// The sample reader is wrapped in a Lazy; the factory lambda captures
// audioClip by reference.
// NOTE(review): this presumably means the clip must outlive the iterator
// until the reader has been created - confirm against Lazy's semantics.
SampleIterator::SampleIterator(const AudioClip& audioClip, size_type sampleIndex) :
	sampleReader([&audioClip]() -> SampleReader {
		return audioClip.createSampleReader();
	}),
	sampleIndex(sampleIndex)
{}
|
|
@ -0,0 +1,141 @@
|
||||||
|
#pragma once
|
||||||
|
#include <memory>
|
||||||
|
#include "TimeRange.h"
|
||||||
|
#include <functional>
|
||||||
|
#include "Lazy.h"
|
||||||
|
|
||||||
|
class AudioClip;
|
||||||
|
class SampleIterator;
|
||||||
|
|
||||||
|
class AudioClip {
|
||||||
|
public:
|
||||||
|
using value_type = float;
|
||||||
|
using size_type = int64_t;
|
||||||
|
using difference_type = int64_t;
|
||||||
|
using iterator = SampleIterator;
|
||||||
|
using SampleReader = std::function<value_type(size_type)>;
|
||||||
|
|
||||||
|
virtual ~AudioClip() {}
|
||||||
|
virtual std::unique_ptr<AudioClip> clone() const = 0;
|
||||||
|
virtual int getSampleRate() const = 0;
|
||||||
|
virtual size_type size() const = 0;
|
||||||
|
TimeRange getTruncatedRange() const;
|
||||||
|
SampleReader createSampleReader() const;
|
||||||
|
iterator begin() const;
|
||||||
|
iterator end() const;
|
||||||
|
private:
|
||||||
|
virtual SampleReader createUnsafeSampleReader() const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// An audio effect is a function that takes ownership of an input clip and
// returns a clip (possibly the same one) as its result.
using AudioEffect = std::function<std::unique_ptr<AudioClip>(std::unique_ptr<AudioClip>)>;
|
||||||
|
|
||||||
|
// Applies an effect to a clip, allowing effects to be chained:
// std::move(clip) | effectA | effectB
std::unique_ptr<AudioClip> operator|(std::unique_ptr<AudioClip> clip, AudioEffect effect);
|
||||||
|
|
||||||
|
// Namespace-scope shorthand for AudioClip::SampleReader
using SampleReader = AudioClip::SampleReader;
|
||||||
|
|
||||||
|
class SampleIterator {
|
||||||
|
public:
|
||||||
|
using value_type = AudioClip::value_type;
|
||||||
|
using size_type = AudioClip::size_type;
|
||||||
|
using difference_type = AudioClip::difference_type;
|
||||||
|
|
||||||
|
SampleIterator();
|
||||||
|
|
||||||
|
size_type getSampleIndex() const;
|
||||||
|
void seek(size_type sampleIndex);
|
||||||
|
value_type operator*() const;
|
||||||
|
value_type operator[](difference_type n) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
friend AudioClip;
|
||||||
|
SampleIterator(const AudioClip& audioClip, size_type sampleIndex);
|
||||||
|
|
||||||
|
Lazy<SampleReader> sampleReader;
|
||||||
|
size_type sampleIndex;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns the index of the sample this iterator currently points to.
inline SampleIterator::size_type SampleIterator::getSampleIndex() const {
	return this->sampleIndex;
}
|
||||||
|
|
||||||
|
// Repositions the iterator. The index is stored as-is; it is not
// validated here and no sample is read.
inline void SampleIterator::seek(size_type sampleIndex) {
	size_type& currentIndex = this->sampleIndex;
	currentIndex = sampleIndex;
}
|
||||||
|
|
||||||
|
// Reads the sample at the current position through the lazily created reader.
inline SampleIterator::value_type SampleIterator::operator*() const {
	const size_type currentIndex = sampleIndex;
	return (*sampleReader)(currentIndex);
}
|
||||||
|
|
||||||
|
// Reads the sample n positions away from the current position without
// moving the iterator.
inline SampleIterator::value_type SampleIterator::operator[](difference_type n) const {
	const size_type targetIndex = sampleIndex + n;
	return (*sampleReader)(targetIndex);
}
|
||||||
|
|
||||||
|
// Two iterators are equal if they point to the same sample index.
// Only the indices are compared.
inline bool operator==(const SampleIterator& lhs, const SampleIterator& rhs) {
	const auto leftIndex = lhs.getSampleIndex();
	const auto rightIndex = rhs.getSampleIndex();
	return leftIndex == rightIndex;
}
|
||||||
|
|
||||||
|
// Two iterators differ if they point to different sample indices.
inline bool operator!=(const SampleIterator& lhs, const SampleIterator& rhs) {
	return !(lhs.getSampleIndex() == rhs.getSampleIndex());
}
|
||||||
|
|
||||||
|
// Orders iterators by sample index: true if lhs is positioned before rhs.
inline bool operator<(const SampleIterator& lhs, const SampleIterator& rhs) {
	return rhs.getSampleIndex() > lhs.getSampleIndex();
}
|
||||||
|
|
||||||
|
// Orders iterators by sample index: true if lhs is positioned after rhs.
inline bool operator>(const SampleIterator& lhs, const SampleIterator& rhs) {
	return rhs.getSampleIndex() < lhs.getSampleIndex();
}
|
||||||
|
|
||||||
|
// True if lhs is positioned at or before rhs.
inline bool operator<=(const SampleIterator& lhs, const SampleIterator& rhs) {
	return !(lhs.getSampleIndex() > rhs.getSampleIndex());
}
|
||||||
|
|
||||||
|
// True if lhs is positioned at or after rhs.
inline bool operator>=(const SampleIterator& lhs, const SampleIterator& rhs) {
	return !(lhs.getSampleIndex() < rhs.getSampleIndex());
}
|
||||||
|
|
||||||
|
// Advances the iterator by n samples (n may be negative) and returns it.
inline SampleIterator& operator+=(SampleIterator& it, SampleIterator::difference_type n) {
	const auto targetIndex = it.getSampleIndex() + n;
	it.seek(targetIndex);
	return it;
}
|
||||||
|
|
||||||
|
// Moves the iterator back by n samples (n may be negative) and returns it.
inline SampleIterator& operator-=(SampleIterator& it, SampleIterator::difference_type n) {
	const auto targetIndex = it.getSampleIndex() - n;
	it.seek(targetIndex);
	return it;
}
|
||||||
|
|
||||||
|
// Pre-increment: advances the iterator by one sample and returns it.
inline SampleIterator& operator++(SampleIterator& it) {
	it += 1;
	return it;
}
|
||||||
|
|
||||||
|
// Post-increment: advances the iterator by one sample and returns a copy
// of the iterator as it was before advancing.
inline SampleIterator operator++(SampleIterator& it, int) {
	SampleIterator previous = it;
	++it;
	return previous;
}
|
||||||
|
|
||||||
|
// Pre-decrement: moves the iterator back by one sample and returns it.
inline SampleIterator& operator--(SampleIterator& it) {
	it -= 1;
	return it;
}
|
||||||
|
|
||||||
|
// Post-decrement: moves the iterator back by one sample and returns a copy
// of the iterator as it was before moving.
inline SampleIterator operator--(SampleIterator& it, int) {
	SampleIterator previous = it;
	--it;
	return previous;
}
|
||||||
|
|
||||||
|
// Returns a copy of the iterator that is positioned n samples further on.
inline SampleIterator operator+(const SampleIterator& it, SampleIterator::difference_type n) {
	SampleIterator advanced = it;
	advanced.seek(it.getSampleIndex() + n);
	return advanced;
}
|
||||||
|
|
||||||
|
// Returns a copy of the iterator that is positioned n samples further back.
inline SampleIterator operator-(const SampleIterator& it, SampleIterator::difference_type n) {
	SampleIterator rewound = it;
	rewound.seek(it.getSampleIndex() - n);
	return rewound;
}
|
||||||
|
|
||||||
|
// Returns the distance in samples from rhs to lhs (positive if lhs is
// positioned after rhs).
inline SampleIterator::difference_type operator-(const SampleIterator& lhs, const SampleIterator& rhs) {
	const auto leftIndex = lhs.getSampleIndex();
	const auto rightIndex = rhs.getSampleIndex();
	return leftIndex - rightIndex;
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
#include "AudioSegment.h"
|
||||||
|
|
||||||
|
using std::unique_ptr;
|
||||||
|
using std::make_unique;
|
||||||
|
|
||||||
|
// Creates a clip that exposes the part of inputClip covered by range.
// Start and length of the range are given in centiseconds and converted
// to sample counts using the input clip's sample rate (integer division).
// Throws std::invalid_argument if the resulting sample span does not lie
// within the input clip.
AudioSegment::AudioSegment(std::unique_ptr<AudioClip> inputClip, const TimeRange& range) :
	inputClip(std::move(inputClip)),
	// The parameter has been moved from; use the member via this-> from here on
	sampleOffset(static_cast<int64_t>(range.getStart().count()) * this->inputClip->getSampleRate() / 100),
	sampleCount(static_cast<int64_t>(range.getLength().count()) * this->inputClip->getSampleRate() / 100)
{
	const bool liesWithinInput =
		sampleOffset >= 0
		&& sampleOffset + sampleCount <= this->inputClip->size();
	if (liesWithinInput) return;

	throw std::invalid_argument("Segment extends beyond input clip.");
}
|
||||||
|
|
||||||
|
// Creates a copy of this segment using the copy constructor. The input
// clip is held through a shared_ptr member, so the copy refers to the same
// input clip object.
unique_ptr<AudioClip> AudioSegment::clone() const {
	return unique_ptr<AudioClip>(make_unique<AudioSegment>(*this));
}
|
||||||
|
|
||||||
|
// Creates a reader that maps segment-relative sample indices to indices of
// the input clip by adding the segment's start offset. The closure owns its
// own input reader and a copy of the offset, so it does not refer back to
// this object.
SampleReader AudioSegment::createUnsafeSampleReader() const {
	return [readInput = inputClip->createSampleReader(), offset = sampleOffset](size_type index) {
		const size_type inputIndex = index + offset;
		return readInput(inputIndex);
	};
}
|
||||||
|
|
||||||
|
// Returns an effect that cuts the given time range out of its input clip.
// The range is copied into the effect.
AudioEffect segment(const TimeRange& range) {
	auto cutSegment = [range](unique_ptr<AudioClip> inputClip) {
		return make_unique<AudioSegment>(std::move(inputClip), range);
	};
	return cutSegment;
}
|
|
@ -0,0 +1,26 @@
|
||||||
|
#pragma once
|
||||||
|
#include "AudioClip.h"
|
||||||
|
|
||||||
|
class AudioSegment : public AudioClip {
|
||||||
|
public:
|
||||||
|
AudioSegment(std::unique_ptr<AudioClip> inputClip, const TimeRange& range);
|
||||||
|
std::unique_ptr<AudioClip> clone() const override;
|
||||||
|
int getSampleRate() const override;
|
||||||
|
size_type size() const override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
SampleReader createUnsafeSampleReader() const override;
|
||||||
|
|
||||||
|
std::shared_ptr<AudioClip> inputClip;
|
||||||
|
size_type sampleOffset, sampleCount;
|
||||||
|
};
|
||||||
|
|
||||||
|
inline int AudioSegment::getSampleRate() const {
|
||||||
|
return inputClip->getSampleRate();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the number of samples in the segment, as computed from the time
// range in the constructor.
inline AudioClip::size_type AudioSegment::size() const {
	return this->sampleCount;
}
|
||||||
|
|
||||||
|
// Returns an effect that wraps its input clip in an AudioSegment covering
// the given time range
AudioEffect segment(const TimeRange& range);
|
|
@ -1,9 +0,0 @@
|
||||||
#include "AudioStream.h"
|
|
||||||
|
|
||||||
TimeRange AudioStream::getTruncatedRange() const {
|
|
||||||
return TimeRange(0cs, centiseconds(100 * getSampleCount() / getSampleRate()));
|
|
||||||
}
|
|
||||||
|
|
||||||
bool AudioStream::endOfStream() const {
|
|
||||||
return getSampleIndex() >= getSampleCount();
|
|
||||||
}
|
|
|
@ -1,18 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <memory>
|
|
||||||
#include "TimeRange.h"
|
|
||||||
|
|
||||||
// A mono stream of floating-point samples.
|
|
||||||
class AudioStream {
|
|
||||||
public:
|
|
||||||
virtual ~AudioStream() {}
|
|
||||||
virtual std::unique_ptr<AudioStream> clone(bool reset) const = 0;
|
|
||||||
virtual int getSampleRate() const = 0;
|
|
||||||
virtual int64_t getSampleCount() const = 0;
|
|
||||||
TimeRange getTruncatedRange() const;
|
|
||||||
virtual int64_t getSampleIndex() const = 0;
|
|
||||||
virtual void seek(int64_t sampleIndex) = 0;
|
|
||||||
bool endOfStream() const;
|
|
||||||
virtual float readSample() = 0;
|
|
||||||
};
|
|
|
@ -1,50 +0,0 @@
|
||||||
#include "AudioStreamSegment.h"
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
AudioStreamSegment::AudioStreamSegment(std::unique_ptr<AudioStream> audioStream, const TimeRange& range) :
|
|
||||||
audioStream(std::move(audioStream)),
|
|
||||||
sampleOffset(static_cast<int64_t>(range.getStart().count()) * this->audioStream->getSampleRate() / 100),
|
|
||||||
sampleCount(static_cast<int64_t>(range.getLength().count()) * this->audioStream->getSampleRate() / 100)
|
|
||||||
{
|
|
||||||
seek(0);
|
|
||||||
|
|
||||||
if (sampleOffset < 0 || sampleOffset + sampleCount > this->audioStream->getSampleCount()) {
|
|
||||||
throw std::invalid_argument("Segment extends beyond input stream.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
AudioStreamSegment::AudioStreamSegment(const AudioStreamSegment& rhs, bool reset) :
|
|
||||||
audioStream(rhs.audioStream->clone(false)),
|
|
||||||
sampleOffset(rhs.sampleOffset),
|
|
||||||
sampleCount(rhs.sampleCount)
|
|
||||||
{
|
|
||||||
if (reset) seek(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> AudioStreamSegment::clone(bool reset) const {
|
|
||||||
return std::make_unique<AudioStreamSegment>(*this, reset);
|
|
||||||
}
|
|
||||||
|
|
||||||
int AudioStreamSegment::getSampleRate() const {
|
|
||||||
return audioStream->getSampleRate();
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t AudioStreamSegment::getSampleCount() const {
|
|
||||||
return sampleCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t AudioStreamSegment::getSampleIndex() const {
|
|
||||||
return audioStream->getSampleIndex() - sampleOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
void AudioStreamSegment::seek(int64_t sampleIndex) {
|
|
||||||
audioStream->seek(sampleIndex + sampleOffset);
|
|
||||||
}
|
|
||||||
|
|
||||||
float AudioStreamSegment::readSample() {
|
|
||||||
return audioStream->readSample();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> createSegment(std::unique_ptr<AudioStream> audioStream, const TimeRange& range) {
|
|
||||||
return std::make_unique<AudioStreamSegment>(std::move(audioStream), range);
|
|
||||||
}
|
|
|
@ -1,21 +0,0 @@
|
||||||
#pragma once
|
|
||||||
#include <audio/AudioStream.h>
|
|
||||||
#include <TimeRange.h>
|
|
||||||
|
|
||||||
class AudioStreamSegment : public AudioStream {
|
|
||||||
public:
|
|
||||||
AudioStreamSegment(std::unique_ptr<AudioStream> audioStream, const TimeRange& range);
|
|
||||||
AudioStreamSegment(const AudioStreamSegment& rhs, bool reset);
|
|
||||||
std::unique_ptr<AudioStream> clone(bool reset) const override;
|
|
||||||
int getSampleRate() const override;
|
|
||||||
int64_t getSampleCount() const override;
|
|
||||||
int64_t getSampleIndex() const override;
|
|
||||||
void seek(int64_t sampleIndex) override;
|
|
||||||
float readSample() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::unique_ptr<AudioStream> audioStream;
|
|
||||||
const int64_t sampleOffset, sampleCount;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> createSegment(std::unique_ptr<AudioStream> audioStream, const TimeRange& range);
|
|
|
@ -1,73 +1,46 @@
|
||||||
#include "DCOffset.h"
|
#include "DCOffset.h"
|
||||||
#include <gsl_util.h>
|
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
DCOffset::DCOffset(std::unique_ptr<AudioStream> inputStream, float offset) :
|
using std::unique_ptr;
|
||||||
inputStream(std::move(inputStream)),
|
using std::make_unique;
|
||||||
|
|
||||||
|
// Creates a clip whose samples are those of inputClip, scaled by
// 1 / (1 + |offset|) and then shifted by offset.
// NOTE(review): relies on the floating-point overload of std::abs being
// visible here - confirm that <cmath> is reachable from this file.
DCOffset::DCOffset(unique_ptr<AudioClip> inputClip, float offset) :
	inputClip(std::move(inputClip)),
	offset(offset),
	factor(1 / (1 + std::abs(offset)))
{
}
|
||||||
|
|
||||||
DCOffset::DCOffset(const DCOffset& rhs, bool reset) :
|
// Creates a copy of this clip using the copy constructor. The input clip is
// held through a shared_ptr member, so the copy refers to the same input
// clip object.
unique_ptr<AudioClip> DCOffset::clone() const {
	return unique_ptr<AudioClip>(make_unique<DCOffset>(*this));
}
|
||||||
|
|
||||||
int DCOffset::getSampleRate() const {
|
// Creates a reader that reads a sample from the input clip, scales it by
// factor and adds offset. The closure holds copies of both values and its
// own input reader, so it does not refer back to this object.
SampleReader DCOffset::createUnsafeSampleReader() const {
	return [readInput = inputClip->createSampleReader(), scale = factor, shift = offset](size_type index) {
		const float inputSample = readInput(index);
		return inputSample * scale + shift;
	};
}
|
||||||
|
|
||||||
int64_t DCOffset::getSampleCount() const {
|
float getDCOffset(const AudioClip& audioClip) {
|
||||||
return inputStream->getSampleCount();
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t DCOffset::getSampleIndex() const {
|
|
||||||
return inputStream->getSampleIndex();
|
|
||||||
}
|
|
||||||
|
|
||||||
void DCOffset::seek(int64_t sampleIndex) {
|
|
||||||
inputStream->seek(sampleIndex);
|
|
||||||
}
|
|
||||||
|
|
||||||
float DCOffset::readSample() {
|
|
||||||
float sample = inputStream->readSample();
|
|
||||||
return sample * factor + offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon) {
|
|
||||||
if (std::abs(offset) < epsilon) return audioStream;
|
|
||||||
return std::make_unique<DCOffset>(std::move(audioStream), offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
float getDCOffset(AudioStream& audioStream) {
|
|
||||||
int flatMeanSampleCount, fadingMeanSampleCount;
|
int flatMeanSampleCount, fadingMeanSampleCount;
|
||||||
int sampleRate = audioStream.getSampleRate();
|
int sampleRate = audioClip.getSampleRate();
|
||||||
if (audioStream.getSampleCount() > 4 * sampleRate) {
|
if (audioClip.size() > 4 * sampleRate) {
|
||||||
// Long audio file. Average over the first 3 seconds, then fade out over the 4th.
|
// Long audio file. Average over the first 3 seconds, then fade out over the 4th.
|
||||||
flatMeanSampleCount = 3 * sampleRate;
|
flatMeanSampleCount = 3 * sampleRate;
|
||||||
fadingMeanSampleCount = 1 * sampleRate;
|
fadingMeanSampleCount = 1 * sampleRate;
|
||||||
} else {
|
} else {
|
||||||
// Short audio file. Average over the entire length.
|
// Short audio file. Average over the entire length.
|
||||||
flatMeanSampleCount = static_cast<int>(audioStream.getSampleCount());
|
flatMeanSampleCount = static_cast<int>(audioClip.size());
|
||||||
fadingMeanSampleCount = 0;
|
fadingMeanSampleCount = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t originalSampleIndex = audioStream.getSampleIndex();
|
auto read = audioClip.createSampleReader();
|
||||||
audioStream.seek(0);
|
|
||||||
auto restorePosition = gsl::finally([&]() { audioStream.seek(originalSampleIndex); });
|
|
||||||
|
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (int i = 0; i < flatMeanSampleCount; i++) {
|
for (int i = 0; i < flatMeanSampleCount; ++i) {
|
||||||
sum += audioStream.readSample();
|
sum += read(i);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < fadingMeanSampleCount; i++) {
|
for (int i = 0; i < fadingMeanSampleCount; ++i) {
|
||||||
double weight = static_cast<double>(fadingMeanSampleCount - i) / fadingMeanSampleCount;
|
double weight = static_cast<double>(fadingMeanSampleCount - i) / fadingMeanSampleCount;
|
||||||
sum += audioStream.readSample() * weight;
|
sum += read(flatMeanSampleCount + i) * weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
double totalWeight = flatMeanSampleCount + (fadingMeanSampleCount + 1) / 2.0;
|
double totalWeight = flatMeanSampleCount + (fadingMeanSampleCount + 1) / 2.0;
|
||||||
|
@ -75,7 +48,16 @@ float getDCOffset(AudioStream& audioStream) {
|
||||||
return static_cast<float>(offset);
|
return static_cast<float>(offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> inputStream) {
|
// Returns an effect that adds the given DC offset to its input clip.
// If the magnitude of the offset is below epsilon, the effect returns the
// input clip unchanged instead of wrapping it.
AudioEffect addDCOffset(float offset, float epsilon) {
	return [offset, epsilon](unique_ptr<AudioClip> inputClip) -> unique_ptr<AudioClip> {
		const bool offsetIsNegligible = std::abs(offset) < epsilon;
		if (!offsetIsNegligible) {
			return make_unique<DCOffset>(std::move(inputClip), offset);
		}
		return inputClip;
	};
}
|
||||||
|
|
||||||
|
// Returns an effect that measures the DC offset of its input clip via
// getDCOffset and compensates for it by adding the negated offset.
// epsilon is forwarded to addDCOffset.
AudioEffect removeDCOffset(float epsilon) {
	return [epsilon](unique_ptr<AudioClip> inputClip) {
		const float measuredOffset = getDCOffset(*inputClip);
		AudioEffect compensate = addDCOffset(-measuredOffset, epsilon);
		return std::move(inputClip) | compensate;
	};
}
|
||||||
|
|
|
@ -1,26 +1,32 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "AudioStream.h"
|
#include "AudioClip.h"
|
||||||
|
|
||||||
// Applies a constant DC offset to an audio stream and reduces its amplitude
|
// Applies a constant DC offset to an audio clip and reduces its amplitude
|
||||||
// to prevent clipping
|
// to prevent clipping
|
||||||
class DCOffset : public AudioStream {
|
class DCOffset : public AudioClip {
|
||||||
public:
|
public:
|
||||||
DCOffset(std::unique_ptr<AudioStream> inputStream, float offset);
|
DCOffset(std::unique_ptr<AudioClip> inputClip, float offset);
|
||||||
DCOffset(const DCOffset& rhs, bool reset);
|
std::unique_ptr<AudioClip> clone() const override;
|
||||||
std::unique_ptr<AudioStream> clone(bool reset) const override;
|
|
||||||
int getSampleRate() const override;
|
int getSampleRate() const override;
|
||||||
int64_t getSampleCount() const override;
|
size_type size() const override;
|
||||||
int64_t getSampleIndex() const override;
|
|
||||||
void seek(int64_t sampleIndex) override;
|
|
||||||
float readSample() override;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::unique_ptr<AudioStream> inputStream;
|
SampleReader createUnsafeSampleReader() const override;
|
||||||
|
|
||||||
|
std::shared_ptr<AudioClip> inputClip;
|
||||||
float offset;
|
float offset;
|
||||||
float factor;
|
float factor;
|
||||||
};
|
};
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> addDCOffset(std::unique_ptr<AudioStream> audioStream, float offset, float epsilon = 1.0f / 15000);
|
inline int DCOffset::getSampleRate() const {
|
||||||
|
return inputClip->getSampleRate();
|
||||||
|
}
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> removeDCOffset(std::unique_ptr<AudioStream> audioStream);
|
// Adding a DC offset does not change the sample count: forwards to the input clip.
inline AudioClip::size_type DCOffset::size() const {
	const size_type inputSampleCount = inputClip->size();
	return inputSampleCount;
}
|
||||||
|
|
||||||
|
// Measures the DC offset of the given clip. Clips longer than 4 seconds are
// averaged over the first 3 seconds with a fade-out over the 4th; shorter
// clips are averaged over their entire length.
float getDCOffset(const AudioClip& audioClip);
|
||||||
|
|
||||||
|
// Returns an effect that adds the given DC offset to a clip. Offsets whose
// magnitude is below epsilon are ignored and the clip is passed through.
AudioEffect addDCOffset(float offset, float epsilon = 1.0f / 15000);
|
||||||
|
// Returns an effect that measures a clip's DC offset and adds its negation.
// epsilon is passed on to addDCOffset.
AudioEffect removeDCOffset(float epsilon = 1.0f / 15000);
|
||||||
|
|
|
@ -1,105 +1,62 @@
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include "SampleRateConverter.h"
|
#include "SampleRateConverter.h"
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <algorithm>
|
|
||||||
#include <format.h>
|
#include <format.h>
|
||||||
|
|
||||||
using std::invalid_argument;
|
using std::invalid_argument;
|
||||||
|
using std::unique_ptr;
|
||||||
|
using std::make_unique;
|
||||||
|
|
||||||
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate) :
|
// Creates a clip that presents inputClip at the (lower or equal) sample
// rate outputSampleRate.
// Throws std::invalid_argument if outputSampleRate is not positive or is
// higher than the input clip's sample rate (upsampling is not supported).
SampleRateConverter::SampleRateConverter(unique_ptr<AudioClip> inputClip, int outputSampleRate) :
	inputClip(std::move(inputClip)),
	// The parameter has been moved from; use the member via this-> from here on
	downscalingFactor(static_cast<double>(this->inputClip->getSampleRate()) / outputSampleRate),
	outputSampleRate(outputSampleRate),
	// llround rather than lround: the result is stored in an int64_t, and
	// long is only 32 bits wide on some platforms (e.g. 64-bit Windows)
	outputSampleCount(std::llround(this->inputClip->size() / downscalingFactor))
{
	if (outputSampleRate <= 0) {
		throw invalid_argument("Sample rate must be positive.");
	}
	if (this->inputClip->getSampleRate() < outputSampleRate) {
		throw invalid_argument(fmt::format("Upsampling not supported. Input sample rate must not be below {}Hz.", outputSampleRate));
	}
}
|
||||||
|
|
||||||
SampleRateConverter::SampleRateConverter(const SampleRateConverter& rhs, bool reset) :
|
// Creates a copy of this converter using the copy constructor. The input
// clip is held through a shared_ptr member, so the copy refers to the same
// input clip object.
unique_ptr<AudioClip> SampleRateConverter::clone() const {
	return unique_ptr<AudioClip>(make_unique<SampleRateConverter>(*this));
}
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> SampleRateConverter::clone(bool reset) const {
|
float mean(double inputStart, double inputEnd, const SampleReader& read) {
|
||||||
return std::make_unique<SampleRateConverter>(*this, reset);
|
|
||||||
}
|
|
||||||
|
|
||||||
int SampleRateConverter::getSampleRate() const {
|
|
||||||
return outputSampleRate;
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t SampleRateConverter::getSampleCount() const {
|
|
||||||
return outputSampleCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t SampleRateConverter::getSampleIndex() const {
|
|
||||||
return nextOutputSampleIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
void SampleRateConverter::seek(int64_t sampleIndex) {
|
|
||||||
if (sampleIndex < 0 || sampleIndex >= outputSampleCount) throw std::invalid_argument("sampleIndex out of range.");
|
|
||||||
|
|
||||||
nextOutputSampleIndex = sampleIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
float SampleRateConverter::readSample() {
|
|
||||||
if (nextOutputSampleIndex >= outputSampleCount) throw std::out_of_range("End of stream.");
|
|
||||||
|
|
||||||
double inputStart = nextOutputSampleIndex * downscalingFactor;
|
|
||||||
double inputEnd = (nextOutputSampleIndex + 1) * downscalingFactor;
|
|
||||||
|
|
||||||
nextOutputSampleIndex++;
|
|
||||||
return mean(inputStart, inputEnd);
|
|
||||||
}
|
|
||||||
|
|
||||||
float SampleRateConverter::mean(double inputStart, double inputEnd) {
|
|
||||||
// Calculate weighted sum...
|
// Calculate weighted sum...
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
|
|
||||||
// ... first sample (weight <= 1)
|
// ... first sample (weight <= 1)
|
||||||
int64_t startIndex = static_cast<int64_t>(inputStart);
|
int64_t startIndex = static_cast<int64_t>(inputStart);
|
||||||
sum += getInputSample(startIndex) * ((startIndex + 1) - inputStart);
|
sum += read(startIndex) * ((startIndex + 1) - inputStart);
|
||||||
|
|
||||||
// ... middle samples (weight 1 each)
|
// ... middle samples (weight 1 each)
|
||||||
int64_t endIndex = static_cast<int64_t>(inputEnd);
|
int64_t endIndex = static_cast<int64_t>(inputEnd);
|
||||||
for (int64_t index = startIndex + 1; index < endIndex; ++index) {
|
for (int64_t index = startIndex + 1; index < endIndex; ++index) {
|
||||||
sum += getInputSample(index);
|
sum += read(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ... last sample (weight < 1)
|
// ... last sample (weight < 1)
|
||||||
sum += getInputSample(endIndex) * (inputEnd - endIndex);
|
if (endIndex < inputEnd) {
|
||||||
|
sum += read(endIndex) * (inputEnd - endIndex);
|
||||||
|
}
|
||||||
|
|
||||||
return static_cast<float>(sum / (inputEnd - inputStart));
|
return static_cast<float>(sum / (inputEnd - inputStart));
|
||||||
}
|
}
|
||||||
|
|
||||||
float SampleRateConverter::getInputSample(int64_t sampleIndex) {
|
// Creates a reader that computes each output sample as the weighted mean of
// the input samples it covers: output sample i corresponds to the input
// interval [i * downscalingFactor, (i + 1) * downscalingFactor).
SampleReader SampleRateConverter::createUnsafeSampleReader() const {
	return [
		read = inputClip->createSampleReader(),
		downscalingFactor = downscalingFactor,
		// Sample count of the INPUT clip. inputStart and inputEnd are input
		// sample positions, so the end must be clamped against the input
		// size, not against this converter's (smaller) output size.
		inputSize = inputClip->size()
	](size_type index) {
		const double inputStart = index * downscalingFactor;
		// The output sample count is rounded, so the last interval may
		// reach past the end of the input; clamp it.
		const double inputEnd = std::min((index + 1) * downscalingFactor, static_cast<double>(inputSize));
		return mean(inputStart, inputEnd, read);
	};
}
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate) {
|
// Returns an effect that converts its input clip to the given sample rate.
// A clip that already has the requested sample rate is passed through
// unchanged, which avoids an extra layer of per-sample averaging that would
// not alter the samples. For any other rate the clip is wrapped in a
// SampleRateConverter, whose constructor throws std::invalid_argument for
// non-positive rates and for upsampling.
AudioEffect resample(int sampleRate) {
	return [sampleRate](unique_ptr<AudioClip> inputClip) -> unique_ptr<AudioClip> {
		if (inputClip->getSampleRate() == sampleRate) {
			return inputClip;
		}
		return make_unique<SampleRateConverter>(std::move(inputClip), sampleRate);
	};
}
|
||||||
|
|
|
@ -1,32 +1,29 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include "AudioStream.h"
|
#include "AudioClip.h"
|
||||||
|
|
||||||
class SampleRateConverter : public AudioStream {
|
class SampleRateConverter : public AudioClip {
|
||||||
public:
|
public:
|
||||||
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputSampleRate);
|
SampleRateConverter(std::unique_ptr<AudioClip> inputClip, int outputSampleRate);
|
||||||
SampleRateConverter(const SampleRateConverter& rhs, bool reset);
|
std::unique_ptr<AudioClip> clone() const override;
|
||||||
std::unique_ptr<AudioStream> clone(bool reset) const override;
|
|
||||||
int getSampleRate() const override;
|
int getSampleRate() const override;
|
||||||
int64_t getSampleCount() const override;
|
size_type size() const override;
|
||||||
int64_t getSampleIndex() const override;
|
|
||||||
void seek(int64_t sampleIndex) override;
|
|
||||||
float readSample() override;
|
|
||||||
private:
|
private:
|
||||||
std::unique_ptr<AudioStream> inputStream;
|
SampleReader createUnsafeSampleReader() const override;
|
||||||
double downscalingFactor; // input sample rate / output sample rate
|
|
||||||
|
|
||||||
|
std::shared_ptr<AudioClip> inputClip;
|
||||||
|
double downscalingFactor; // input sample rate / output sample rate
|
||||||
int outputSampleRate;
|
int outputSampleRate;
|
||||||
int64_t outputSampleCount;
|
int64_t outputSampleCount;
|
||||||
|
|
||||||
float lastInputSample;
|
|
||||||
int64_t lastInputSampleIndex;
|
|
||||||
|
|
||||||
int64_t nextOutputSampleIndex;
|
|
||||||
|
|
||||||
float mean(double start, double end);
|
|
||||||
float getInputSample(int64_t sampleIndex);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> convertSampleRate(std::unique_ptr<AudioStream> audioStream, int sampleRate);
|
// Returns an effect that wraps its input clip in a SampleRateConverter
// with the given output sample rate
AudioEffect resample(int sampleRate);
|
||||||
|
|
||||||
|
inline int SampleRateConverter::getSampleRate() const {
|
||||||
|
return outputSampleRate;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the number of samples after conversion, as computed in the constructor.
inline AudioClip::size_type SampleRateConverter::size() const {
	return this->outputSampleCount;
}
|
||||||
|
|
|
@ -1,59 +0,0 @@
|
||||||
#include "UnboundedStream.h"
|
|
||||||
|
|
||||||
using boost::optional;
|
|
||||||
|
|
||||||
UnboundedStream::UnboundedStream(std::unique_ptr<AudioStream> inputStream) :
|
|
||||||
innerStream(std::move(inputStream)),
|
|
||||||
sampleIndex(innerStream->getSampleIndex()),
|
|
||||||
firstSample(inputStream->getSampleCount() ? optional<float>() : 0.0f),
|
|
||||||
lastSample(inputStream->getSampleCount() ? optional<float>() : 0.0f)
|
|
||||||
{}
|
|
||||||
|
|
||||||
UnboundedStream::UnboundedStream(const UnboundedStream& rhs, bool reset) :
|
|
||||||
innerStream(rhs.innerStream->clone(reset)),
|
|
||||||
sampleIndex(rhs.sampleIndex),
|
|
||||||
firstSample(rhs.firstSample),
|
|
||||||
lastSample(rhs.lastSample)
|
|
||||||
{}
|
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> UnboundedStream::clone(bool reset) const {
|
|
||||||
return std::make_unique<UnboundedStream>(*this, reset);
|
|
||||||
}
|
|
||||||
|
|
||||||
int UnboundedStream::getSampleRate() const {
|
|
||||||
return innerStream->getSampleRate();
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t UnboundedStream::getSampleCount() const {
|
|
||||||
return innerStream->getSampleCount();
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t UnboundedStream::getSampleIndex() const {
|
|
||||||
return sampleIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
void UnboundedStream::seek(int64_t sampleIndex) {
|
|
||||||
this->sampleIndex = sampleIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
float UnboundedStream::readSample() {
|
|
||||||
if (sampleIndex < 0) {
|
|
||||||
if (!firstSample) {
|
|
||||||
innerStream->seek(0);
|
|
||||||
firstSample = innerStream->readSample();
|
|
||||||
}
|
|
||||||
return firstSample.get();
|
|
||||||
}
|
|
||||||
if (sampleIndex >= innerStream->getSampleCount()) {
|
|
||||||
if (!lastSample) {
|
|
||||||
innerStream->seek(innerStream->getSampleCount() - 1);
|
|
||||||
lastSample = innerStream->readSample();
|
|
||||||
}
|
|
||||||
return lastSample.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sampleIndex != innerStream->getSampleIndex()) {
|
|
||||||
innerStream->seek(sampleIndex);
|
|
||||||
}
|
|
||||||
return innerStream->readSample();
|
|
||||||
}
|
|
|
@ -1,22 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "AudioStream.h"
|
|
||||||
#include <boost/optional/optional.hpp>
|
|
||||||
|
|
||||||
// Stream wrapper that allows reading before the start and past the end of the input stream.
|
|
||||||
class UnboundedStream : public AudioStream {
|
|
||||||
public:
|
|
||||||
UnboundedStream(std::unique_ptr<AudioStream> inputStream);
|
|
||||||
UnboundedStream(const UnboundedStream& rhs, bool reset);
|
|
||||||
std::unique_ptr<AudioStream> clone(bool reset) const override;
|
|
||||||
int getSampleRate() const override;
|
|
||||||
int64_t getSampleCount() const override;
|
|
||||||
int64_t getSampleIndex() const override;
|
|
||||||
void seek(int64_t sampleIndex) override;
|
|
||||||
float readSample() override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::unique_ptr<AudioStream> innerStream;
|
|
||||||
int64_t sampleIndex;
|
|
||||||
boost::optional<float> firstSample, lastSample;
|
|
||||||
};
|
|
|
@ -7,6 +7,10 @@ using std::runtime_error;
|
||||||
using fmt::format;
|
using fmt::format;
|
||||||
using std::string;
|
using std::string;
|
||||||
using namespace little_endian;
|
using namespace little_endian;
|
||||||
|
using std::unique_ptr;
|
||||||
|
using std::make_unique;
|
||||||
|
using std::make_shared;
|
||||||
|
using boost::filesystem::path;
|
||||||
|
|
||||||
#define INT24_MIN (-8388608)
|
#define INT24_MIN (-8388608)
|
||||||
#define INT24_MAX 8388607
|
#define INT24_MAX 8388607
|
||||||
|
@ -25,12 +29,34 @@ enum class Codec {
|
||||||
Float = 0x03
|
Float = 0x03
|
||||||
};
|
};
|
||||||
|
|
||||||
WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
|
std::ifstream openFile(path filePath) {
|
||||||
|
try {
|
||||||
|
std::ifstream file;
|
||||||
|
file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||||
|
file.open(filePath.c_str(), std::ios::binary);
|
||||||
|
|
||||||
|
// Error messages on stream exceptions are mostly useless.
|
||||||
|
// Read some dummy data so that we can throw a decent exception in case the file is missing, locked, etc.
|
||||||
|
file.seekg(0, std::ios_base::end);
|
||||||
|
if (file.tellg()) {
|
||||||
|
file.seekg(0);
|
||||||
|
file.get();
|
||||||
|
file.seekg(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
return std::move(file);
|
||||||
|
} catch (const std::ifstream::failure&) {
|
||||||
|
char message[256];
|
||||||
|
strerror_s(message, sizeof message, errno);
|
||||||
|
throw runtime_error(message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
WaveFileReader::WaveFileReader(path filePath) :
|
||||||
filePath(filePath),
|
filePath(filePath),
|
||||||
file(),
|
formatInfo{}
|
||||||
frameIndex(0)
|
|
||||||
{
|
{
|
||||||
openFile();
|
auto file = openFile(filePath);
|
||||||
|
|
||||||
file.seekg(0, std::ios_base::end);
|
file.seekg(0, std::ios_base::end);
|
||||||
std::streamoff fileSize = file.tellg();
|
std::streamoff fileSize = file.tellg();
|
||||||
|
@ -57,16 +83,15 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
|
||||||
|
|
||||||
// Read chunks until we reach the data chunk
|
// Read chunks until we reach the data chunk
|
||||||
bool reachedDataChunk = false;
|
bool reachedDataChunk = false;
|
||||||
bytesPerSample = 0;
|
|
||||||
while (!reachedDataChunk && remaining(8)) {
|
while (!reachedDataChunk && remaining(8)) {
|
||||||
uint32_t chunkId = read<uint32_t>(file);
|
uint32_t chunkId = read<uint32_t>(file);
|
||||||
int chunkSize = read<uint32_t>(file);
|
int chunkSize = read<uint32_t>(file);
|
||||||
switch (chunkId) {
|
switch (chunkId) {
|
||||||
case fourcc('f', 'm', 't', ' '): {
|
case fourcc('f', 'm', 't', ' '): {
|
||||||
// Read relevant data
|
// Read relevant data
|
||||||
Codec codec = (Codec)read<uint16_t>(file);
|
Codec codec = static_cast<Codec>(read<uint16_t>(file));
|
||||||
channelCount = read<uint16_t>(file);
|
formatInfo.channelCount = read<uint16_t>(file);
|
||||||
frameRate = read<uint32_t>(file);
|
formatInfo.frameRate = read<uint32_t>(file);
|
||||||
read<uint32_t>(file); // Bytes per second
|
read<uint32_t>(file); // Bytes per second
|
||||||
int frameSize = read<uint16_t>(file);
|
int frameSize = read<uint16_t>(file);
|
||||||
int bitsPerSample = read<uint16_t>(file);
|
int bitsPerSample = read<uint16_t>(file);
|
||||||
|
@ -75,31 +100,32 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
|
||||||
file.seekg(roundToEven(chunkSize) - 16, file.cur);
|
file.seekg(roundToEven(chunkSize) - 16, file.cur);
|
||||||
|
|
||||||
// Determine sample format
|
// Determine sample format
|
||||||
|
int bytesPerSample;
|
||||||
switch (codec) {
|
switch (codec) {
|
||||||
case Codec::PCM:
|
case Codec::PCM:
|
||||||
// Determine sample size.
|
// Determine sample size.
|
||||||
// According to the WAVE standard, sample sizes that are not multiples of 8 bits
|
// According to the WAVE standard, sample sizes that are not multiples of 8 bits
|
||||||
// (e.g. 12 bits) can be treated like the next-larger byte size.
|
// (e.g. 12 bits) can be treated like the next-larger byte size.
|
||||||
if (bitsPerSample == 8) {
|
if (bitsPerSample == 8) {
|
||||||
sampleFormat = SampleFormat::UInt8;
|
formatInfo.sampleFormat = SampleFormat::UInt8;
|
||||||
bytesPerSample = 1;
|
bytesPerSample = 1;
|
||||||
} else if (bitsPerSample <= 16) {
|
} else if (bitsPerSample <= 16) {
|
||||||
sampleFormat = SampleFormat::Int16;
|
formatInfo.sampleFormat = SampleFormat::Int16;
|
||||||
bytesPerSample = 2;
|
bytesPerSample = 2;
|
||||||
} else if (bitsPerSample <= 24) {
|
} else if (bitsPerSample <= 24) {
|
||||||
sampleFormat = SampleFormat::Int24;
|
formatInfo.sampleFormat = SampleFormat::Int24;
|
||||||
bytesPerSample = 3;
|
bytesPerSample = 3;
|
||||||
} else {
|
} else {
|
||||||
throw runtime_error(
|
throw runtime_error(
|
||||||
format("Unsupported sample format: {}-bit integer samples.", bitsPerSample));
|
format("Unsupported sample format: {}-bit integer samples.", bitsPerSample));
|
||||||
}
|
}
|
||||||
if (bytesPerSample != frameSize / channelCount) {
|
if (bytesPerSample != frameSize / formatInfo.channelCount) {
|
||||||
throw runtime_error("Unsupported sample organization.");
|
throw runtime_error("Unsupported sample organization.");
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case Codec::Float:
|
case Codec::Float:
|
||||||
if (bitsPerSample == 32) {
|
if (bitsPerSample == 32) {
|
||||||
sampleFormat = SampleFormat::Float32;
|
formatInfo.sampleFormat = SampleFormat::Float32;
|
||||||
bytesPerSample = 4;
|
bytesPerSample = 4;
|
||||||
} else {
|
} else {
|
||||||
throw runtime_error(format("Unsupported sample format: {}-bit floating-point samples.", bitsPerSample));
|
throw runtime_error(format("Unsupported sample format: {}-bit floating-point samples.", bitsPerSample));
|
||||||
|
@ -108,13 +134,13 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
|
||||||
default:
|
default:
|
||||||
throw runtime_error("Unsupported sample format. Only uncompressed formats are supported.");
|
throw runtime_error("Unsupported sample format. Only uncompressed formats are supported.");
|
||||||
}
|
}
|
||||||
|
formatInfo.bytesPerFrame = bytesPerSample * formatInfo.channelCount;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case fourcc('d', 'a', 't', 'a'): {
|
case fourcc('d', 'a', 't', 'a'): {
|
||||||
reachedDataChunk = true;
|
reachedDataChunk = true;
|
||||||
dataOffset = file.tellg();
|
formatInfo.dataOffset = file.tellg();
|
||||||
int sampleCount = chunkSize / bytesPerSample;
|
formatInfo.frameCount = chunkSize / formatInfo.bytesPerFrame;
|
||||||
frameCount = sampleCount / channelCount;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default: {
|
default: {
|
||||||
|
@ -124,75 +150,13 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!reachedDataChunk) {
|
|
||||||
dataOffset = file.tellg();
|
|
||||||
frameCount = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
WaveFileReader::WaveFileReader(const WaveFileReader& rhs, bool reset) :
|
unique_ptr<AudioClip> WaveFileReader::clone() const {
|
||||||
filePath(rhs.filePath),
|
return make_unique<WaveFileReader>(*this);
|
||||||
file(),
|
|
||||||
bytesPerSample(rhs.bytesPerSample),
|
|
||||||
sampleFormat(rhs.sampleFormat),
|
|
||||||
frameRate(rhs.frameRate),
|
|
||||||
frameCount(rhs.frameCount),
|
|
||||||
channelCount(rhs.channelCount),
|
|
||||||
dataOffset(rhs.dataOffset),
|
|
||||||
frameIndex(-1)
|
|
||||||
{
|
|
||||||
openFile();
|
|
||||||
seek(reset ? 0 : rhs.frameIndex);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<AudioStream> WaveFileReader::clone(bool reset) const {
|
inline AudioClip::value_type readSample(std::ifstream& file, SampleFormat sampleFormat, int channelCount) {
|
||||||
return std::make_unique<WaveFileReader>(*this, reset);
|
|
||||||
}
|
|
||||||
|
|
||||||
void WaveFileReader::openFile() {
|
|
||||||
try {
|
|
||||||
file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
|
||||||
file.open(filePath, std::ios::binary);
|
|
||||||
|
|
||||||
// Error messages on stream exceptions are mostly useless.
|
|
||||||
// Read some dummy data so that we can throw a decent exception in case the file is missing, locked, etc.
|
|
||||||
file.seekg(0, std::ios_base::end);
|
|
||||||
if (file.tellg()) {
|
|
||||||
file.seekg(0);
|
|
||||||
file.get();
|
|
||||||
file.seekg(0);
|
|
||||||
}
|
|
||||||
} catch (const std::ifstream::failure&) {
|
|
||||||
char message[256];
|
|
||||||
strerror_s(message, sizeof message, errno);
|
|
||||||
throw runtime_error(message);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int WaveFileReader::getSampleRate() const {
|
|
||||||
return frameRate;
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t WaveFileReader::getSampleCount() const {
|
|
||||||
return frameCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t WaveFileReader::getSampleIndex() const {
|
|
||||||
return frameIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
void WaveFileReader::seek(int64_t frameIndex) {
|
|
||||||
if (frameIndex < 0 || frameIndex > frameCount) throw std::invalid_argument("frameIndex out of range.");
|
|
||||||
|
|
||||||
file.seekg(dataOffset + static_cast<std::streamoff>(frameIndex * channelCount * bytesPerSample));
|
|
||||||
this->frameIndex = frameIndex;
|
|
||||||
}
|
|
||||||
|
|
||||||
float WaveFileReader::readSample() {
|
|
||||||
if (frameIndex >= frameCount) throw std::out_of_range("End of stream.");
|
|
||||||
++frameIndex;
|
|
||||||
|
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
for (int channelIndex = 0; channelIndex < channelCount; channelIndex++) {
|
for (int channelIndex = 0; channelIndex < channelCount; channelIndex++) {
|
||||||
switch (sampleFormat) {
|
switch (sampleFormat) {
|
||||||
|
@ -221,3 +185,13 @@ float WaveFileReader::readSample() {
|
||||||
|
|
||||||
return sum / channelCount;
|
return sum / channelCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SampleReader WaveFileReader::createUnsafeSampleReader() const {
|
||||||
|
return [formatInfo = formatInfo, file = std::make_shared<std::ifstream>(openFile(filePath)), filePos = std::streampos(0)](size_type index) mutable {
|
||||||
|
std::streampos newFilePos = formatInfo.dataOffset + static_cast<std::streamoff>(index * formatInfo.bytesPerFrame);
|
||||||
|
file->seekg(newFilePos);
|
||||||
|
value_type result = readSample(*file, formatInfo.sampleFormat, formatInfo.channelCount);
|
||||||
|
filePos = newFilePos + static_cast<std::streamoff>(formatInfo.bytesPerFrame);
|
||||||
|
return result;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <boost/filesystem/path.hpp>
|
#include <boost/filesystem/path.hpp>
|
||||||
#include <boost/filesystem/fstream.hpp>
|
#include "AudioClip.h"
|
||||||
#include "AudioStream.h"
|
|
||||||
|
|
||||||
enum class SampleFormat {
|
enum class SampleFormat {
|
||||||
UInt8,
|
UInt8,
|
||||||
|
@ -11,28 +10,33 @@ enum class SampleFormat {
|
||||||
Float32
|
Float32
|
||||||
};
|
};
|
||||||
|
|
||||||
class WaveFileReader : public AudioStream {
|
class WaveFileReader : public AudioClip {
|
||||||
public:
|
public:
|
||||||
WaveFileReader(boost::filesystem::path filePath);
|
WaveFileReader(boost::filesystem::path filePath);
|
||||||
WaveFileReader(const WaveFileReader& rhs, bool reset);
|
std::unique_ptr<AudioClip> clone() const override;
|
||||||
std::unique_ptr<AudioStream> clone(bool reset) const override;
|
int getSampleRate() const override;
|
||||||
int getSampleRate() const override ;
|
size_type size() const override;
|
||||||
int64_t getSampleCount() const override;
|
|
||||||
int64_t getSampleIndex() const override;
|
|
||||||
void seek(int64_t sampleIndex) override;
|
|
||||||
float readSample() override;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void openFile();
|
SampleReader createUnsafeSampleReader() const override;
|
||||||
|
|
||||||
|
struct WaveFormatInfo {
|
||||||
|
int bytesPerFrame;
|
||||||
|
SampleFormat sampleFormat;
|
||||||
|
int frameRate;
|
||||||
|
int64_t frameCount;
|
||||||
|
int channelCount;
|
||||||
|
std::streampos dataOffset;
|
||||||
|
};
|
||||||
|
|
||||||
private:
|
|
||||||
boost::filesystem::path filePath;
|
boost::filesystem::path filePath;
|
||||||
boost::filesystem::ifstream file;
|
WaveFormatInfo formatInfo;
|
||||||
int bytesPerSample;
|
|
||||||
SampleFormat sampleFormat;
|
|
||||||
int frameRate;
|
|
||||||
int64_t frameCount;
|
|
||||||
int channelCount;
|
|
||||||
std::streampos dataOffset;
|
|
||||||
int64_t frameIndex;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
inline int WaveFileReader::getSampleRate() const {
|
||||||
|
return formatInfo.frameRate;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline AudioClip::size_type WaveFileReader::size() const {
|
||||||
|
return formatInfo.frameCount;
|
||||||
|
}
|
||||||
|
|
|
@ -10,31 +10,31 @@ inline int16_t floatSampleToInt16(float sample) {
|
||||||
return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
|
return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
|
||||||
}
|
}
|
||||||
|
|
||||||
void process16bitAudioStream(AudioStream& audioStream, function<void(const vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink) {
|
void process16bitAudioClip(const AudioClip& audioClip, function<void(const vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink) {
|
||||||
// Process entire sound stream
|
// Process entire sound stream
|
||||||
vector<int16_t> buffer;
|
vector<int16_t> buffer;
|
||||||
buffer.reserve(bufferCapacity);
|
buffer.reserve(bufferCapacity);
|
||||||
int sampleCount = 0;
|
int sampleCount = 0;
|
||||||
|
auto it = audioClip.begin();
|
||||||
|
auto end = audioClip.end();
|
||||||
do {
|
do {
|
||||||
// Read to buffer
|
// Read to buffer
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
while (buffer.size() < bufferCapacity && !audioStream.endOfStream()) {
|
for (; buffer.size() < bufferCapacity && it != end; ++it) {
|
||||||
// Read sample
|
// Read sample to buffer
|
||||||
float floatSample = audioStream.readSample();
|
buffer.push_back(floatSampleToInt16(*it));
|
||||||
int16_t sample = floatSampleToInt16(floatSample);
|
|
||||||
buffer.push_back(sample);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process buffer
|
// Process buffer
|
||||||
processBuffer(buffer);
|
processBuffer(buffer);
|
||||||
|
|
||||||
sampleCount += buffer.size();
|
sampleCount += buffer.size();
|
||||||
progressSink.reportProgress(static_cast<double>(sampleCount) / audioStream.getSampleCount());
|
progressSink.reportProgress(static_cast<double>(sampleCount) / audioClip.size());
|
||||||
} while (buffer.size());
|
} while (buffer.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
void process16bitAudioStream(AudioStream& audioStream, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
|
void process16bitAudioClip(const AudioClip& audioClip, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
|
||||||
const size_t capacity = 1600; // 0.1 second capacity
|
const size_t capacity = 1600; // 0.1 second capacity
|
||||||
process16bitAudioStream(audioStream, processBuffer, capacity, progressSink);
|
process16bitAudioClip(audioClip, processBuffer, capacity, progressSink);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,8 @@
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include "audio/AudioStream.h"
|
#include "audio/AudioClip.h"
|
||||||
#include "ProgressBar.h"
|
#include "ProgressBar.h"
|
||||||
|
|
||||||
void process16bitAudioStream(AudioStream& audioStream, std::function<void(const std::vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink);
|
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, size_t bufferCapacity, ProgressSink& progressSink);
|
||||||
void process16bitAudioStream(AudioStream& audioStream, std::function<void(const std::vector<int16_t>&)> processBuffer, ProgressSink& progressSink);
|
void process16bitAudioClip(const AudioClip& audioClip, std::function<void(const std::vector<int16_t>&)> processBuffer, ProgressSink& progressSink);
|
|
@ -8,7 +8,7 @@
|
||||||
#include "processing.h"
|
#include "processing.h"
|
||||||
#include <gsl_util.h>
|
#include <gsl_util.h>
|
||||||
#include <parallel.h>
|
#include <parallel.h>
|
||||||
#include "AudioStreamSegment.h"
|
#include "AudioSegment.h"
|
||||||
|
|
||||||
using std::vector;
|
using std::vector;
|
||||||
using boost::adaptors::transformed;
|
using boost::adaptors::transformed;
|
||||||
|
@ -16,7 +16,7 @@ using fmt::format;
|
||||||
using std::runtime_error;
|
using std::runtime_error;
|
||||||
using std::unique_ptr;
|
using std::unique_ptr;
|
||||||
|
|
||||||
BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, ProgressSink& progressSink) {
|
BoundedTimeline<void> webRtcDetectVoiceActivity(const AudioClip& audioClip, ProgressSink& progressSink) {
|
||||||
VadInst* vadHandle = WebRtcVad_Create();
|
VadInst* vadHandle = WebRtcVad_Create();
|
||||||
if (!vadHandle) throw runtime_error("Error creating WebRTC VAD handle.");
|
if (!vadHandle) throw runtime_error("Error creating WebRTC VAD handle.");
|
||||||
|
|
||||||
|
@ -30,14 +30,14 @@ BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, Progre
|
||||||
if (error) throw runtime_error("Error setting WebRTC VAD aggressiveness.");
|
if (error) throw runtime_error("Error setting WebRTC VAD aggressiveness.");
|
||||||
|
|
||||||
// Detect activity
|
// Detect activity
|
||||||
BoundedTimeline<void> activity(audioStream.getTruncatedRange());
|
BoundedTimeline<void> activity(audioClip.getTruncatedRange());
|
||||||
centiseconds time = 0cs;
|
centiseconds time = 0cs;
|
||||||
const size_t bufferCapacity = audioStream.getSampleRate() / 100;
|
const size_t bufferCapacity = audioClip.getSampleRate() / 100;
|
||||||
auto processBuffer = [&](const vector<int16_t>& buffer) {
|
auto processBuffer = [&](const vector<int16_t>& buffer) {
|
||||||
// WebRTC is picky regarding buffer size
|
// WebRTC is picky regarding buffer size
|
||||||
if (buffer.size() < bufferCapacity) return;
|
if (buffer.size() < bufferCapacity) return;
|
||||||
|
|
||||||
int result = WebRtcVad_Process(vadHandle, audioStream.getSampleRate(), buffer.data(), buffer.size()) == 1;
|
int result = WebRtcVad_Process(vadHandle, audioClip.getSampleRate(), buffer.data(), buffer.size()) == 1;
|
||||||
if (result == -1) throw runtime_error("Error processing audio buffer using WebRTC VAD.");
|
if (result == -1) throw runtime_error("Error processing audio buffer using WebRTC VAD.");
|
||||||
|
|
||||||
bool isActive = result != 0;
|
bool isActive = result != 0;
|
||||||
|
@ -46,7 +46,7 @@ BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, Progre
|
||||||
}
|
}
|
||||||
time += 1cs;
|
time += 1cs;
|
||||||
};
|
};
|
||||||
process16bitAudioStream(*audioStream.clone(true), processBuffer, bufferCapacity, progressSink);
|
process16bitAudioClip(audioClip, processBuffer, bufferCapacity, progressSink);
|
||||||
|
|
||||||
// WebRTC adapts to the audio. This means results may not be correct at the very beginning.
|
// WebRTC adapts to the audio. This means results may not be correct at the very beginning.
|
||||||
// It sometimes returns false activity at the very beginning, mistaking the background noise for speech.
|
// It sometimes returns false activity at the very beginning, mistaking the background noise for speech.
|
||||||
|
@ -54,31 +54,31 @@ BoundedTimeline<void> webRtcDetectVoiceActivity(AudioStream& audioStream, Progre
|
||||||
if (!activity.empty()) {
|
if (!activity.empty()) {
|
||||||
TimeRange firstActivity = activity.begin()->getTimeRange();
|
TimeRange firstActivity = activity.begin()->getTimeRange();
|
||||||
activity.clear(firstActivity);
|
activity.clear(firstActivity);
|
||||||
unique_ptr<AudioStream> streamStart = createSegment(audioStream.clone(true), TimeRange(0cs, firstActivity.getEnd()));
|
unique_ptr<AudioClip> streamStart = audioClip.clone() | segment(TimeRange(0cs, firstActivity.getEnd()));
|
||||||
time = 0cs;
|
time = 0cs;
|
||||||
process16bitAudioStream(*streamStart, processBuffer, bufferCapacity, progressSink);
|
process16bitAudioClip(*streamStart, processBuffer, bufferCapacity, progressSink);
|
||||||
}
|
}
|
||||||
|
|
||||||
return activity;
|
return activity;
|
||||||
}
|
}
|
||||||
|
|
||||||
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink) {
|
BoundedTimeline<void> detectVoiceActivity(const AudioClip& inputAudioClip, ProgressSink& progressSink) {
|
||||||
// Prepare audio for VAD
|
// Prepare audio for VAD
|
||||||
audioStream = removeDCOffset(convertSampleRate(std::move(audioStream), 16000));
|
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(16000) | removeDCOffset();
|
||||||
|
|
||||||
BoundedTimeline<void> activity(audioStream->getTruncatedRange());
|
BoundedTimeline<void> activity(audioClip->getTruncatedRange());
|
||||||
std::mutex activityMutex;
|
std::mutex activityMutex;
|
||||||
|
|
||||||
// Split audio into segments and perform parallel VAD
|
// Split audio into segments and perform parallel VAD
|
||||||
int segmentCount = getProcessorCoreCount();
|
int segmentCount = getProcessorCoreCount();
|
||||||
centiseconds audioLength = audioStream->getTruncatedRange().getLength();
|
centiseconds audioLength = audioClip->getTruncatedRange().getLength();
|
||||||
vector<TimeRange> audioSegments;
|
vector<TimeRange> audioSegments;
|
||||||
for (int i = 0; i < segmentCount; ++i) {
|
for (int i = 0; i < segmentCount; ++i) {
|
||||||
TimeRange segmentRange = TimeRange(i * audioLength / segmentCount, (i + 1) * audioLength / segmentCount);
|
TimeRange segmentRange = TimeRange(i * audioLength / segmentCount, (i + 1) * audioLength / segmentCount);
|
||||||
audioSegments.push_back(segmentRange);
|
audioSegments.push_back(segmentRange);
|
||||||
}
|
}
|
||||||
runParallel([&](const TimeRange& segmentRange, ProgressSink& segmentProgressSink) {
|
runParallel([&](const TimeRange& segmentRange, ProgressSink& segmentProgressSink) {
|
||||||
unique_ptr<AudioStream> audioSegment = createSegment(audioStream->clone(false), segmentRange);
|
unique_ptr<AudioClip> audioSegment = audioClip->clone() | segment(segmentRange);
|
||||||
BoundedTimeline<void> activitySegment = webRtcDetectVoiceActivity(*audioSegment, segmentProgressSink);
|
BoundedTimeline<void> activitySegment = webRtcDetectVoiceActivity(*audioSegment, segmentProgressSink);
|
||||||
|
|
||||||
std::lock_guard<std::mutex> lock(activityMutex);
|
std::lock_guard<std::mutex> lock(activityMutex);
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
#include <memory>
|
#include "AudioClip.h"
|
||||||
#include "AudioStream.h"
|
|
||||||
#include <BoundedTimeline.h>
|
#include <BoundedTimeline.h>
|
||||||
#include <ProgressBar.h>
|
#include <ProgressBar.h>
|
||||||
|
|
||||||
BoundedTimeline<void> detectVoiceActivity(std::unique_ptr<AudioStream> audioStream, ProgressSink& progressSink);
|
BoundedTimeline<void> detectVoiceActivity(const AudioClip& audioClip, ProgressSink& progressSink);
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
using namespace little_endian;
|
using namespace little_endian;
|
||||||
|
|
||||||
void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileName) {
|
void createWaveFile(const AudioClip& audioClip, std::string fileName) {
|
||||||
// Open file
|
// Open file
|
||||||
std::ofstream file;
|
std::ofstream file;
|
||||||
file.exceptions(std::ofstream::failbit | std::ofstream::badbit);
|
file.exceptions(std::ofstream::failbit | std::ofstream::badbit);
|
||||||
|
@ -15,7 +15,7 @@ void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileNa
|
||||||
uint32_t formatChunkSize = 16;
|
uint32_t formatChunkSize = 16;
|
||||||
uint16_t channelCount = 1;
|
uint16_t channelCount = 1;
|
||||||
uint16_t frameSize = static_cast<uint16_t>(channelCount * sizeof(float));
|
uint16_t frameSize = static_cast<uint16_t>(channelCount * sizeof(float));
|
||||||
uint32_t dataChunkSize = static_cast<uint32_t>(inputStream->getSampleCount() * frameSize);
|
uint32_t dataChunkSize = static_cast<uint32_t>(audioClip.size() * frameSize);
|
||||||
uint32_t riffChunkSize = 4 + (8 + formatChunkSize) + (8 + dataChunkSize);
|
uint32_t riffChunkSize = 4 + (8 + formatChunkSize) + (8 + dataChunkSize);
|
||||||
write<uint32_t>(riffChunkSize, file);
|
write<uint32_t>(riffChunkSize, file);
|
||||||
write<uint32_t>(fourcc('W', 'A', 'V', 'E'), file);
|
write<uint32_t>(fourcc('W', 'A', 'V', 'E'), file);
|
||||||
|
@ -26,7 +26,7 @@ void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileNa
|
||||||
uint16_t codec = 0x03; // 32-bit float
|
uint16_t codec = 0x03; // 32-bit float
|
||||||
write<uint16_t>(codec, file);
|
write<uint16_t>(codec, file);
|
||||||
write<uint16_t>(channelCount, file);
|
write<uint16_t>(channelCount, file);
|
||||||
uint32_t frameRate = static_cast<uint16_t>(inputStream->getSampleRate());
|
uint32_t frameRate = static_cast<uint16_t>(audioClip.getSampleRate());
|
||||||
write<uint32_t>(frameRate, file);
|
write<uint32_t>(frameRate, file);
|
||||||
uint32_t bytesPerSecond = frameRate * frameSize;
|
uint32_t bytesPerSecond = frameRate * frameSize;
|
||||||
write<uint32_t>(bytesPerSecond, file);
|
write<uint32_t>(bytesPerSecond, file);
|
||||||
|
@ -37,8 +37,7 @@ void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileNa
|
||||||
// Write data chunk
|
// Write data chunk
|
||||||
write<uint32_t>(fourcc('d', 'a', 't', 'a'), file);
|
write<uint32_t>(fourcc('d', 'a', 't', 'a'), file);
|
||||||
write<uint32_t>(dataChunkSize, file);
|
write<uint32_t>(dataChunkSize, file);
|
||||||
while (!inputStream->endOfStream()) {
|
for (float sample : audioClip) {
|
||||||
float sample = inputStream->readSample();
|
|
||||||
write<float>(sample, file);
|
write<float>(sample, file);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <memory>
|
#include "AudioClip.h"
|
||||||
#include <string>
|
|
||||||
#include "AudioStream.h"
|
|
||||||
|
|
||||||
void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileName);
|
void createWaveFile(const AudioClip& audioClip, std::string fileName);
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
#include <boost/filesystem/operations.hpp>
|
#include <boost/filesystem/operations.hpp>
|
||||||
#include "stringTools.h"
|
#include "stringTools.h"
|
||||||
#include <boost/range/adaptor/transformed.hpp>
|
#include <boost/range/adaptor/transformed.hpp>
|
||||||
|
#include <boost/filesystem/fstream.hpp>
|
||||||
|
|
||||||
using std::exception;
|
using std::exception;
|
||||||
using std::string;
|
using std::string;
|
||||||
|
@ -43,7 +44,7 @@ string getMessage(const exception& e) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
unique_ptr<AudioStream> createAudioStream(path filePath) {
|
unique_ptr<AudioClip> createAudioClip(path filePath) {
|
||||||
try {
|
try {
|
||||||
return std::make_unique<WaveFileReader>(filePath);
|
return std::make_unique<WaveFileReader>(filePath);
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
|
@ -144,7 +145,7 @@ int main(int argc, char *argv[]) {
|
||||||
{
|
{
|
||||||
ProgressBar progressBar;
|
ProgressBar progressBar;
|
||||||
phones = detectPhones(
|
phones = detectPhones(
|
||||||
createAudioStream(inputFileName.getValue()),
|
*createAudioClip(inputFileName.getValue()),
|
||||||
dialogFile.isSet() ? readTextFile(path(dialogFile.getValue())) : boost::optional<u32string>(),
|
dialogFile.isSet() ? readTextFile(path(dialogFile.getValue())) : boost::optional<u32string>(),
|
||||||
progressBar);
|
progressBar);
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@
|
||||||
#include <audio/DCOffset.h>
|
#include <audio/DCOffset.h>
|
||||||
#include <Timeline.h>
|
#include <Timeline.h>
|
||||||
#include <audio/voiceActivityDetection.h>
|
#include <audio/voiceActivityDetection.h>
|
||||||
#include <audio/AudioStreamSegment.h>
|
#include "audio/AudioSegment.h"
|
||||||
#include "languageModels.h"
|
#include "languageModels.h"
|
||||||
#include "tokenization.h"
|
#include "tokenization.h"
|
||||||
#include "g2p.h"
|
#include "g2p.h"
|
||||||
|
@ -95,9 +95,9 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
|
||||||
logging::log(logLevel, message);
|
logging::log(logLevel, message);
|
||||||
}
|
}
|
||||||
|
|
||||||
BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
|
BoundedTimeline<string> recognizeWords(const AudioClip& inputAudioClip, ps_decoder_t& decoder, bool& decoderIsStillUsable, ProgressSink& progressSink) {
|
||||||
// Convert audio stream to the exact format PocketSphinx requires
|
// Convert audio stream to the exact format PocketSphinx requires
|
||||||
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);
|
||||||
|
|
||||||
// Restart timing at 0
|
// Restart timing at 0
|
||||||
ps_start_stream(&decoder);
|
ps_start_stream(&decoder);
|
||||||
|
@ -111,7 +111,7 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
|
||||||
int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
|
int searchedFrameCount = ps_process_raw(&decoder, buffer.data(), buffer.size(), false, false);
|
||||||
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
|
if (searchedFrameCount < 0) throw runtime_error("Error analyzing raw audio data for word recognition.");
|
||||||
};
|
};
|
||||||
process16bitAudioStream(*audioStream.get(), processBuffer, progressSink);
|
process16bitAudioClip(*audioClip, processBuffer, progressSink);
|
||||||
|
|
||||||
// End recognition
|
// End recognition
|
||||||
error = ps_end_utt(&decoder);
|
error = ps_end_utt(&decoder);
|
||||||
|
@ -121,7 +121,7 @@ BoundedTimeline<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_d
|
||||||
// As a result, the following utterance will be garbage.
|
// As a result, the following utterance will be garbage.
|
||||||
// As a workaround, we throw away the decoder in this case.
|
// As a workaround, we throw away the decoder in this case.
|
||||||
// See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529
|
// See https://sourceforge.net/p/cmusphinx/discussion/help/thread/f1dd91c5/#7529
|
||||||
BoundedTimeline<string> result(audioStream->getTruncatedRange());
|
BoundedTimeline<string> result(audioClip->getTruncatedRange());
|
||||||
bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
|
bool noWordsRecognized = reinterpret_cast<ngram_search_t*>(decoder.search)->bpidx == 0;
|
||||||
if (noWordsRecognized) {
|
if (noWordsRecognized) {
|
||||||
decoderIsStillUsable = false;
|
decoderIsStillUsable = false;
|
||||||
|
@ -147,7 +147,7 @@ s3wid_t getWordId(const string& word, dict_t& dictionary) {
|
||||||
|
|
||||||
optional<Timeline<Phone>> getPhoneAlignment(
|
optional<Timeline<Phone>> getPhoneAlignment(
|
||||||
const vector<s3wid_t>& wordIds,
|
const vector<s3wid_t>& wordIds,
|
||||||
unique_ptr<AudioStream> audioStream,
|
const AudioClip& inputAudioClip,
|
||||||
ps_decoder_t& decoder,
|
ps_decoder_t& decoder,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
{
|
{
|
||||||
|
@ -164,7 +164,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
|
||||||
if (error) throw runtime_error("Error populating alignment struct.");
|
if (error) throw runtime_error("Error populating alignment struct.");
|
||||||
|
|
||||||
// Convert audio stream to the exact format PocketSphinx requires
|
// Convert audio stream to the exact format PocketSphinx requires
|
||||||
audioStream = convertSampleRate(std::move(audioStream), sphinxSampleRate);
|
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | resample(sphinxSampleRate);
|
||||||
|
|
||||||
// Create search structure
|
// Create search structure
|
||||||
acmod_t* acousticModel = decoder.acmod;
|
acmod_t* acousticModel = decoder.acmod;
|
||||||
|
@ -195,7 +195,7 @@ optional<Timeline<Phone>> getPhoneAlignment(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
process16bitAudioStream(*audioStream.get(), processBuffer, progressSink);
|
process16bitAudioClip(*audioClip, processBuffer, progressSink);
|
||||||
|
|
||||||
// End search
|
// End search
|
||||||
error = ps_search_finish(search.get());
|
error = ps_search_finish(search.get());
|
||||||
|
@ -288,7 +288,7 @@ lambda_unique_ptr<ps_decoder_t> createDecoder(optional<u32string> dialog) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Timeline<Phone> utteranceToPhones(
|
Timeline<Phone> utteranceToPhones(
|
||||||
AudioStream& audioStream,
|
const AudioClip& audioClip,
|
||||||
TimeRange utterance,
|
TimeRange utterance,
|
||||||
ps_decoder_t& decoder,
|
ps_decoder_t& decoder,
|
||||||
bool& decoderIsStillUsable,
|
bool& decoderIsStillUsable,
|
||||||
|
@ -298,10 +298,10 @@ Timeline<Phone> utteranceToPhones(
|
||||||
ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
|
ProgressSink& wordRecognitionProgressSink = utteranceProgressMerger.addSink(1.0);
|
||||||
ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
|
ProgressSink& alignmentProgressSink = utteranceProgressMerger.addSink(0.5);
|
||||||
|
|
||||||
auto streamSegment = createSegment(audioStream.clone(true), utterance);
|
const unique_ptr<AudioClip> clipSegment = audioClip.clone() | segment(utterance);
|
||||||
|
|
||||||
// Get words
|
// Get words
|
||||||
BoundedTimeline<string> words = recognizeWords(streamSegment->clone(true), decoder, decoderIsStillUsable, wordRecognitionProgressSink);
|
BoundedTimeline<string> words = recognizeWords(*clipSegment, decoder, decoderIsStillUsable, wordRecognitionProgressSink);
|
||||||
for (Timed<string> timedWord : words) {
|
for (Timed<string> timedWord : words) {
|
||||||
timedWord.getTimeRange().shift(utterance.getStart());
|
timedWord.getTimeRange().shift(utterance.getStart());
|
||||||
logging::logTimedEvent("word", timedWord);
|
logging::logTimedEvent("word", timedWord);
|
||||||
|
@ -315,8 +315,8 @@ Timeline<Phone> utteranceToPhones(
|
||||||
if (wordIds.empty()) return Timeline<Phone>();
|
if (wordIds.empty()) return Timeline<Phone>();
|
||||||
|
|
||||||
// Align the words' phones with speech
|
// Align the words' phones with speech
|
||||||
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, std::move(streamSegment), decoder, alignmentProgressSink)
|
Timeline<Phone> segmentPhones = getPhoneAlignment(wordIds, *clipSegment, decoder, alignmentProgressSink)
|
||||||
.value_or(ContinuousTimeline<Phone>(streamSegment->getTruncatedRange(), Phone::Unknown));
|
.value_or(ContinuousTimeline<Phone>(clipSegment->getTruncatedRange(), Phone::Unknown));
|
||||||
segmentPhones.shift(utterance.getStart());
|
segmentPhones.shift(utterance.getStart());
|
||||||
for (const auto& timedPhone : segmentPhones) {
|
for (const auto& timedPhone : segmentPhones) {
|
||||||
logging::logTimedEvent("phone", timedPhone);
|
logging::logTimedEvent("phone", timedPhone);
|
||||||
|
@ -326,7 +326,7 @@ Timeline<Phone> utteranceToPhones(
|
||||||
}
|
}
|
||||||
|
|
||||||
BoundedTimeline<Phone> detectPhones(
|
BoundedTimeline<Phone> detectPhones(
|
||||||
unique_ptr<AudioStream> audioStream,
|
const AudioClip& inputAudioClip,
|
||||||
optional<u32string> dialog,
|
optional<u32string> dialog,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
{
|
{
|
||||||
|
@ -335,12 +335,12 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
|
ProgressSink& dialogProgressSink = totalProgressMerger.addSink(15);
|
||||||
|
|
||||||
// Make sure audio stream has no DC offset
|
// Make sure audio stream has no DC offset
|
||||||
audioStream = removeDCOffset(std::move(audioStream));
|
const unique_ptr<AudioClip> audioClip = inputAudioClip.clone() | removeDCOffset();
|
||||||
|
|
||||||
// Split audio into utterances
|
// Split audio into utterances
|
||||||
BoundedTimeline<void> utterances;
|
BoundedTimeline<void> utterances;
|
||||||
try {
|
try {
|
||||||
utterances = detectVoiceActivity(audioStream->clone(true), voiceActivationProgressSink);
|
utterances = detectVoiceActivity(*audioClip, voiceActivationProgressSink);
|
||||||
}
|
}
|
||||||
catch (...) {
|
catch (...) {
|
||||||
std::throw_with_nested(runtime_error("Error detecting segments of speech."));
|
std::throw_with_nested(runtime_error("Error detecting segments of speech."));
|
||||||
|
@ -369,17 +369,16 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
decoderPool.push(std::move(decoder));
|
decoderPool.push(std::move(decoder));
|
||||||
};
|
};
|
||||||
|
|
||||||
BoundedTimeline<Phone> result(audioStream->getTruncatedRange());
|
BoundedTimeline<Phone> result(audioClip->getTruncatedRange());
|
||||||
std::mutex resultMutex;
|
std::mutex resultMutex;
|
||||||
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
|
auto processUtterance = [&](Timed<void> timedUtterance, ProgressSink& utteranceProgressSink) {
|
||||||
logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
|
logging::logTimedEvent("utterance", timedUtterance.getTimeRange(), string(""));
|
||||||
|
|
||||||
// Detect phones for utterance
|
// Detect phones for utterance
|
||||||
auto decoder = getDecoder();
|
auto decoder = getDecoder();
|
||||||
auto audioStreamCopy = audioStream->clone(true);
|
|
||||||
bool decoderIsStillUsable = true;
|
bool decoderIsStillUsable = true;
|
||||||
Timeline<Phone> phones =
|
Timeline<Phone> phones =
|
||||||
utteranceToPhones(*audioStreamCopy, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink);
|
utteranceToPhones(*audioClip, timedUtterance.getTimeRange(), *decoder, decoderIsStillUsable, utteranceProgressSink);
|
||||||
if (decoderIsStillUsable) {
|
if (decoderIsStillUsable) {
|
||||||
returnDecoder(std::move(decoder));
|
returnDecoder(std::move(decoder));
|
||||||
}
|
}
|
||||||
|
@ -404,7 +403,7 @@ BoundedTimeline<Phone> detectPhones(
|
||||||
// Don't use more threads than there are utterances to be processed
|
// Don't use more threads than there are utterances to be processed
|
||||||
static_cast<int>(utterances.size()),
|
static_cast<int>(utterances.size()),
|
||||||
// Don't waste time creating additional threads (and decoders!) if the recording is short
|
// Don't waste time creating additional threads (and decoders!) if the recording is short
|
||||||
static_cast<int>(duration_cast<std::chrono::seconds>(audioStream->getTruncatedRange().getLength()).count() / 10)
|
static_cast<int>(duration_cast<std::chrono::seconds>(audioClip->getTruncatedRange().getLength()).count() / 10)
|
||||||
});
|
});
|
||||||
logging::debug("Speech recognition -- start");
|
logging::debug("Speech recognition -- start");
|
||||||
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
|
runParallel(processUtterance, utterances, threadCount, dialogProgressSink, getUtteranceProgressWeight);
|
||||||
|
|
|
@ -1,12 +1,11 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <memory>
|
#include "audio/AudioClip.h"
|
||||||
#include "audio/AudioStream.h"
|
|
||||||
#include "Phone.h"
|
#include "Phone.h"
|
||||||
#include "progressBar.h"
|
#include "progressBar.h"
|
||||||
#include "BoundedTimeline.h"
|
#include "BoundedTimeline.h"
|
||||||
|
|
||||||
BoundedTimeline<Phone> detectPhones(
|
BoundedTimeline<Phone> detectPhones(
|
||||||
std::unique_ptr<AudioStream> audioStream,
|
const AudioClip& audioClip,
|
||||||
boost::optional<std::u32string> dialog,
|
boost::optional<std::u32string> dialog,
|
||||||
ProgressSink& progressSink);
|
ProgressSink& progressSink);
|
||||||
|
|
Loading…
Reference in New Issue