Refactored audio streams
* All streams are now mono (simplifies reasoning about samples)
* Streams can be cloned
* Streams are now seekable
This commit is contained in:
parent
419b0ec469
commit
b78e418a8f
|
@ -100,7 +100,7 @@ set(SOURCE_FILES
|
||||||
src/phoneExtraction.cpp
|
src/phoneExtraction.cpp
|
||||||
src/platformTools.cpp
|
src/platformTools.cpp
|
||||||
src/tools.cpp
|
src/tools.cpp
|
||||||
src/audioInput/ChannelDownmixer.cpp
|
src/audioInput/AudioStream.cpp
|
||||||
src/audioInput/SampleRateConverter.cpp
|
src/audioInput/SampleRateConverter.cpp
|
||||||
src/audioInput/WaveFileReader.cpp
|
src/audioInput/WaveFileReader.cpp
|
||||||
src/audioInput/waveFileWriting.cpp
|
src/audioInput/waveFileWriting.cpp
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
#include "AudioStream.h"
|
||||||
|
|
||||||
|
bool AudioStream::endOfStream() {
|
||||||
|
return getSampleIndex() >= getSampleCount();
|
||||||
|
}
|
|
@ -1,9 +1,16 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
// A mono stream of floating-point samples.
|
||||||
class AudioStream {
|
class AudioStream {
|
||||||
public:
|
public:
|
||||||
virtual int getFrameRate() = 0;
|
virtual ~AudioStream() {}
|
||||||
virtual int getFrameCount() = 0;
|
virtual std::unique_ptr<AudioStream> clone(bool reset) = 0;
|
||||||
virtual int getChannelCount() = 0;
|
virtual int getSampleRate() = 0;
|
||||||
virtual bool getNextSample(float &sample) = 0;
|
virtual int getSampleCount() = 0;
|
||||||
|
virtual int getSampleIndex() = 0;
|
||||||
|
virtual void seek(int sampleIndex) = 0;
|
||||||
|
bool endOfStream();
|
||||||
|
virtual float readSample() = 0;
|
||||||
};
|
};
|
||||||
|
|
|
@ -1,31 +0,0 @@
|
||||||
#include "ChannelDownmixer.h"
|
|
||||||
|
|
||||||
ChannelDownmixer::ChannelDownmixer(std::unique_ptr<AudioStream> inputStream) :
|
|
||||||
inputStream(std::move(inputStream)),
|
|
||||||
inputChannelCount(this->inputStream->getChannelCount())
|
|
||||||
{}
|
|
||||||
|
|
||||||
int ChannelDownmixer::getFrameRate() {
|
|
||||||
return inputStream->getFrameRate();
|
|
||||||
}
|
|
||||||
|
|
||||||
int ChannelDownmixer::getFrameCount() {
|
|
||||||
return inputStream->getFrameCount();
|
|
||||||
}
|
|
||||||
|
|
||||||
int ChannelDownmixer::getChannelCount() {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ChannelDownmixer::getNextSample(float &sample) {
|
|
||||||
float sum = 0;
|
|
||||||
for (int channelIndex = 0; channelIndex < inputChannelCount; channelIndex++) {
|
|
||||||
float currentSample;
|
|
||||||
if (!inputStream->getNextSample(currentSample)) return false;
|
|
||||||
|
|
||||||
sum += currentSample;
|
|
||||||
}
|
|
||||||
|
|
||||||
sample = sum / inputChannelCount;
|
|
||||||
return true;
|
|
||||||
}
|
|
|
@ -1,18 +0,0 @@
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "AudioStream.h"
|
|
||||||
#include <memory>
|
|
||||||
|
|
||||||
// Converts a multi-channel audio stream to mono.
|
|
||||||
class ChannelDownmixer : public AudioStream {
|
|
||||||
public:
|
|
||||||
ChannelDownmixer(std::unique_ptr<AudioStream> inputStream);
|
|
||||||
virtual int getFrameRate() override;
|
|
||||||
virtual int getFrameCount() override;
|
|
||||||
virtual int getChannelCount() override;
|
|
||||||
virtual bool getNextSample(float &sample) override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::unique_ptr<AudioStream> inputStream;
|
|
||||||
int inputChannelCount;
|
|
||||||
};
|
|
|
@ -1,80 +1,94 @@
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include "SampleRateConverter.h"
|
#include "SampleRateConverter.h"
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
using std::runtime_error;
|
using std::runtime_error;
|
||||||
|
|
||||||
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate) :
|
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate) :
|
||||||
inputStream(std::move(inputStream)),
|
inputStream(std::move(inputStream)),
|
||||||
downscalingFactor(static_cast<double>(this->inputStream->getFrameRate()) / outputFrameRate),
|
downscalingFactor(static_cast<double>(this->inputStream->getSampleRate()) / outputFrameRate),
|
||||||
outputFrameRate(outputFrameRate),
|
outputFrameRate(outputFrameRate),
|
||||||
outputFrameCount(std::lround(this->inputStream->getFrameCount() / downscalingFactor)),
|
outputFrameCount(std::lround(this->inputStream->getSampleCount() / downscalingFactor)),
|
||||||
lastInputSample(0),
|
lastInputSample(0),
|
||||||
lastInputSampleIndex(-1),
|
lastInputSampleIndex(-1),
|
||||||
nextOutputSampleIndex(0)
|
nextOutputSampleIndex(0)
|
||||||
{
|
{
|
||||||
if (this->inputStream->getChannelCount() != 1) {
|
if (this->inputStream->getSampleRate() < outputFrameRate) {
|
||||||
throw runtime_error("Only mono input streams are supported.");
|
|
||||||
}
|
|
||||||
if (this->inputStream->getFrameRate() < outputFrameRate) {
|
|
||||||
throw runtime_error("Upsampling not supported.");
|
throw runtime_error("Upsampling not supported.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int SampleRateConverter::getFrameRate() {
|
// Clone constructor: builds a converter on top of a clone of rhs's input stream,
// using the same output rate as rhs.
//
// rhs:   the converter to copy.
// reset: if true, the new converter starts at output sample 0;
//        otherwise it continues at rhs's current output position.
SampleRateConverter::SampleRateConverter(const SampleRateConverter& rhs, bool reset) :
	// The rate must be taken from rhs. An unqualified `outputFrameRate` here would
	// name this object's own member, which has not been initialized yet while the
	// arguments of the delegated-to constructor are evaluated.
	SampleRateConverter(rhs.inputStream->clone(reset), rhs.outputFrameRate)
{
	// The delegated-to constructor always starts at 0; restore the position if requested.
	nextOutputSampleIndex = reset ? 0 : rhs.nextOutputSampleIndex;
}
|
||||||
|
|
||||||
|
std::unique_ptr<AudioStream> SampleRateConverter::clone(bool reset) {
|
||||||
|
return std::make_unique<SampleRateConverter>(*this, reset);
|
||||||
|
}
|
||||||
|
|
||||||
|
int SampleRateConverter::getSampleRate() {
|
||||||
return outputFrameRate;
|
return outputFrameRate;
|
||||||
}
|
}
|
||||||
|
|
||||||
int SampleRateConverter::getFrameCount() {
|
int SampleRateConverter::getSampleCount() {
|
||||||
return outputFrameCount;
|
return outputFrameCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
int SampleRateConverter::getChannelCount() {
|
int SampleRateConverter::getSampleIndex() {
|
||||||
return 1;
|
return nextOutputSampleIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool SampleRateConverter::getNextSample(float &sample) {
|
void SampleRateConverter::seek(int sampleIndex) {
|
||||||
if (nextOutputSampleIndex >= outputFrameCount) return false;
|
if (sampleIndex < 0 || sampleIndex >= outputFrameCount) throw std::invalid_argument("sampleIndex out of range.");
|
||||||
|
|
||||||
double start = nextOutputSampleIndex * downscalingFactor;
|
nextOutputSampleIndex = sampleIndex;
|
||||||
double end = (nextOutputSampleIndex + 1) * downscalingFactor;
|
}
|
||||||
|
|
||||||
|
float SampleRateConverter::readSample() {
|
||||||
|
if (nextOutputSampleIndex >= outputFrameCount) throw std::out_of_range("End of stream.");
|
||||||
|
|
||||||
|
double inputStart = nextOutputSampleIndex * downscalingFactor;
|
||||||
|
double inputEnd = (nextOutputSampleIndex + 1) * downscalingFactor;
|
||||||
|
|
||||||
sample = mean(start, end);
|
|
||||||
nextOutputSampleIndex++;
|
nextOutputSampleIndex++;
|
||||||
return true;
|
return mean(inputStart, inputEnd);
|
||||||
}
|
}
|
||||||
|
|
||||||
float SampleRateConverter::mean(double start, double end) {
|
float SampleRateConverter::mean(double inputStart, double inputEnd) {
|
||||||
// Calculate weighted sum...
|
// Calculate weighted sum...
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
|
|
||||||
// ... first sample (weight <= 1)
|
// ... first sample (weight <= 1)
|
||||||
int startIndex = static_cast<int>(start);
|
int startIndex = static_cast<int>(inputStart);
|
||||||
sum += getInputSample(startIndex) * ((startIndex + 1) - start);
|
sum += getInputSample(startIndex) * ((startIndex + 1) - inputStart);
|
||||||
|
|
||||||
// ... middle samples (weight 1 each)
|
// ... middle samples (weight 1 each)
|
||||||
int endIndex = static_cast<int>(end);
|
int endIndex = static_cast<int>(inputEnd);
|
||||||
for (int index = startIndex + 1; index < endIndex; index++) {
|
for (int index = startIndex + 1; index < endIndex; index++) {
|
||||||
sum += getInputSample(index);
|
sum += getInputSample(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ... last sample (weight < 1)
|
// ... last sample (weight < 1)
|
||||||
sum += getInputSample(endIndex) * (end - endIndex);
|
sum += getInputSample(endIndex) * (inputEnd - endIndex);
|
||||||
|
|
||||||
return static_cast<float>(sum / (end - start));
|
return static_cast<float>(sum / (inputEnd - inputStart));
|
||||||
}
|
}
|
||||||
|
|
||||||
float SampleRateConverter::getInputSample(int sampleIndex) {
|
float SampleRateConverter::getInputSample(int sampleIndex) {
|
||||||
|
sampleIndex = std::min(sampleIndex, inputStream->getSampleCount() - 1);
|
||||||
|
if (sampleIndex < 0) return 0.0f;
|
||||||
|
|
||||||
if (sampleIndex == lastInputSampleIndex) {
|
if (sampleIndex == lastInputSampleIndex) {
|
||||||
return lastInputSample;
|
return lastInputSample;
|
||||||
}
|
}
|
||||||
if (sampleIndex == lastInputSampleIndex + 1) {
|
|
||||||
lastInputSampleIndex++;
|
|
||||||
// Read the next sample.
|
|
||||||
// If the input stream has no more samples (at the very end),
|
|
||||||
// we'll just reuse the last sample we have.
|
|
||||||
inputStream->getNextSample(lastInputSample);
|
|
||||||
return lastInputSample;
|
|
||||||
}
|
|
||||||
|
|
||||||
throw runtime_error("Can only return the last sample or the one following it.");
|
if (sampleIndex != inputStream->getSampleIndex()) {
|
||||||
|
inputStream->seek(sampleIndex);
|
||||||
|
}
|
||||||
|
lastInputSample = inputStream->readSample();
|
||||||
|
lastInputSampleIndex = sampleIndex;
|
||||||
|
return lastInputSample;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,22 +1,21 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <vector>
|
|
||||||
#include "AudioStream.h"
|
#include "AudioStream.h"
|
||||||
|
|
||||||
class SampleRateConverter : public AudioStream {
|
class SampleRateConverter : public AudioStream {
|
||||||
public:
|
public:
|
||||||
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate);
|
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate);
|
||||||
virtual int getFrameRate() override;
|
SampleRateConverter(const SampleRateConverter& rhs, bool reset);
|
||||||
virtual int getFrameCount() override;
|
std::unique_ptr<AudioStream> clone(bool reset) override;
|
||||||
virtual int getChannelCount() override;
|
int getSampleRate() override;
|
||||||
virtual bool getNextSample(float &sample) override;
|
int getSampleCount() override;
|
||||||
|
int getSampleIndex() override;
|
||||||
|
void seek(int sampleIndex) override;
|
||||||
|
float readSample() override;
|
||||||
private:
|
private:
|
||||||
// The stream we're reading from
|
|
||||||
std::unique_ptr<AudioStream> inputStream;
|
std::unique_ptr<AudioStream> inputStream;
|
||||||
|
double downscalingFactor; // input frame rate / output frame rate
|
||||||
// input frame rate / output frame rate
|
|
||||||
double downscalingFactor;
|
|
||||||
|
|
||||||
int outputFrameRate;
|
int outputFrameRate;
|
||||||
int outputFrameCount;
|
int outputFrameCount;
|
||||||
|
|
|
@ -24,10 +24,12 @@ enum class Codec {
|
||||||
Float = 0x03
|
Float = 0x03
|
||||||
};
|
};
|
||||||
|
|
||||||
WaveFileReader::WaveFileReader(boost::filesystem::path filePath) {
|
WaveFileReader::WaveFileReader(boost::filesystem::path filePath) :
|
||||||
// Open file
|
filePath(filePath),
|
||||||
file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
file(),
|
||||||
file.open(filePath, std::ios::binary);
|
sampleIndex(0)
|
||||||
|
{
|
||||||
|
openFile();
|
||||||
|
|
||||||
// Read header
|
// Read header
|
||||||
uint32_t rootChunkId = read<uint32_t>(file);
|
uint32_t rootChunkId = read<uint32_t>(file);
|
||||||
|
@ -42,21 +44,21 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) {
|
||||||
|
|
||||||
// Read chunks until we reach the data chunk
|
// Read chunks until we reach the data chunk
|
||||||
bool reachedDataChunk = false;
|
bool reachedDataChunk = false;
|
||||||
int bytesPerSample = 0;
|
bytesPerSample = 0;
|
||||||
do {
|
do {
|
||||||
uint32_t chunkId = read<uint32_t>(file);
|
uint32_t chunkId = read<uint32_t>(file);
|
||||||
int chunkSize = read<uint32_t>(file);
|
int chunkSize = read<uint32_t>(file);
|
||||||
switch (chunkId) {
|
switch (chunkId) {
|
||||||
case fourcc('f', 'm', 't', ' '): {
|
case fourcc('f', 'm', 't', ' '): {
|
||||||
// Read relevant data
|
// Read relevant data
|
||||||
Codec codec = (Codec) read<uint16_t>(file);
|
Codec codec = (Codec)read<uint16_t>(file);
|
||||||
channelCount = read<uint16_t>(file);
|
channelCount = read<uint16_t>(file);
|
||||||
frameRate = read<uint32_t>(file);
|
frameRate = read<uint32_t>(file);
|
||||||
read<uint32_t>(file); // Bytes per second
|
read<uint32_t>(file); // Bytes per second
|
||||||
int frameSize = read<uint16_t>(file);
|
int frameSize = read<uint16_t>(file);
|
||||||
int bitsPerSample = read<uint16_t>(file);
|
int bitsPerSample = read<uint16_t>(file);
|
||||||
|
|
||||||
// We're read 16 bytes so far. Skip the remainder.
|
// We've read 16 bytes so far. Skip the remainder.
|
||||||
file.seekg(roundToEven(chunkSize) - 16, file.cur);
|
file.seekg(roundToEven(chunkSize) - 16, file.cur);
|
||||||
|
|
||||||
// Determine sample format
|
// Determine sample format
|
||||||
|
@ -97,8 +99,9 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) {
|
||||||
}
|
}
|
||||||
case fourcc('d', 'a', 't', 'a'): {
|
case fourcc('d', 'a', 't', 'a'): {
|
||||||
reachedDataChunk = true;
|
reachedDataChunk = true;
|
||||||
remainingSamples = chunkSize / bytesPerSample;
|
dataOffset = file.tellg();
|
||||||
frameCount = remainingSamples / channelCount;
|
sampleCount = chunkSize / bytesPerSample;
|
||||||
|
frameCount = sampleCount / channelCount;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default: {
|
default: {
|
||||||
|
@ -110,43 +113,79 @@ WaveFileReader::WaveFileReader(boost::filesystem::path filePath) {
|
||||||
} while (!reachedDataChunk);
|
} while (!reachedDataChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
int WaveFileReader::getFrameRate() {
|
// Clone constructor: copies the format information already parsed by rhs and
// opens a separate file stream on the same path, so the header is not parsed again.
//
// rhs:   the reader to copy.
// reset: if true, the new reader is positioned at sample 0;
//        otherwise at rhs's current sample index.
WaveFileReader::WaveFileReader(const WaveFileReader& rhs, bool reset) :
	filePath(rhs.filePath),
	file(),
	bytesPerSample(rhs.bytesPerSample),
	sampleFormat(rhs.sampleFormat),
	frameRate(rhs.frameRate),
	frameCount(rhs.frameCount),
	channelCount(rhs.channelCount),
	sampleCount(rhs.sampleCount),
	dataOffset(rhs.dataOffset),
	sampleIndex(-1) // placeholder; overwritten by the seek() call below
{
	openFile();

	// seek() positions the file stream and stores the resulting sample index.
	if (reset) {
		seek(0);
	} else {
		seek(rhs.sampleIndex);
	}
}
|
||||||
|
|
||||||
|
std::unique_ptr<AudioStream> WaveFileReader::clone(bool reset) {
|
||||||
|
return std::make_unique<WaveFileReader>(*this, reset);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Opens the file at filePath for binary reading.
// The exception mask is set before opening, so a failed open (and any later
// stream failure) is reported by the stream as an exception.
void WaveFileReader::openFile() {
	const std::ios_base::iostate throwOn = std::ifstream::failbit | std::ifstream::badbit;
	file.exceptions(throwOn);
	file.open(filePath, std::ios::binary);
}
|
||||||
|
|
||||||
|
int WaveFileReader::getSampleRate() {
|
||||||
return frameRate;
|
return frameRate;
|
||||||
}
|
}
|
||||||
|
|
||||||
int WaveFileReader::getFrameCount() {
|
int WaveFileReader::getSampleCount() {
|
||||||
return frameCount;
|
return frameCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
int WaveFileReader::getChannelCount() {
|
int WaveFileReader::getSampleIndex() {
|
||||||
return channelCount;
|
return sampleIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool WaveFileReader::getNextSample(float &sample) {
|
void WaveFileReader::seek(int sampleIndex) {
|
||||||
if (remainingSamples == 0) return false;
|
if (sampleIndex < 0 || sampleIndex >= sampleCount) throw std::invalid_argument("sampleIndex out of range.");
|
||||||
remainingSamples--;
|
|
||||||
|
|
||||||
|
file.seekg(dataOffset + sampleIndex * channelCount * bytesPerSample);
|
||||||
|
this->sampleIndex = sampleIndex;
|
||||||
|
}
|
||||||
|
|
||||||
|
float WaveFileReader::readSample() {
|
||||||
|
if (sampleIndex + channelCount > sampleCount) throw std::out_of_range("End of stream.");
|
||||||
|
sampleIndex += channelCount;
|
||||||
|
|
||||||
|
float sum = 0;
|
||||||
|
for (int channelIndex = 0; channelIndex < channelCount; channelIndex++) {
|
||||||
switch (sampleFormat) {
|
switch (sampleFormat) {
|
||||||
case SampleFormat::UInt8: {
|
case SampleFormat::UInt8: {
|
||||||
uint8_t raw = read<uint8_t>(file);
|
uint8_t raw = read<uint8_t>(file);
|
||||||
sample = toNormalizedFloat(raw, 0, UINT8_MAX);
|
sum += toNormalizedFloat(raw, 0, UINT8_MAX);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case SampleFormat::Int16: {
|
case SampleFormat::Int16: {
|
||||||
int16_t raw = read<int16_t>(file);
|
int16_t raw = read<int16_t>(file);
|
||||||
sample = toNormalizedFloat(raw, INT16_MIN, INT16_MAX);
|
sum += toNormalizedFloat(raw, INT16_MIN, INT16_MAX);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case SampleFormat::Int24: {
|
case SampleFormat::Int24: {
|
||||||
int raw = read<int, 24>(file);
|
int raw = read<int, 24>(file);
|
||||||
if (raw & 0x800000) raw |= 0xFF000000; // Fix two's complement
|
if (raw & 0x800000) raw |= 0xFF000000; // Fix two's complement
|
||||||
sample = toNormalizedFloat(raw, INT24_MIN, INT24_MAX);
|
sum += toNormalizedFloat(raw, INT24_MIN, INT24_MAX);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case SampleFormat::Float32: {
|
case SampleFormat::Float32: {
|
||||||
sample = read<float>(file);
|
sum += read<float>(file);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
}
|
||||||
|
|
||||||
|
return sum / channelCount;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,5 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <fstream>
|
|
||||||
#include <boost/filesystem/path.hpp>
|
#include <boost/filesystem/path.hpp>
|
||||||
#include <boost/filesystem/fstream.hpp>
|
#include <boost/filesystem/fstream.hpp>
|
||||||
#include "AudioStream.h"
|
#include "AudioStream.h"
|
||||||
|
@ -17,16 +14,26 @@ enum class SampleFormat {
|
||||||
class WaveFileReader : public AudioStream {
|
class WaveFileReader : public AudioStream {
|
||||||
public:
|
public:
|
||||||
WaveFileReader(boost::filesystem::path filePath);
|
WaveFileReader(boost::filesystem::path filePath);
|
||||||
virtual int getFrameRate() override ;
|
WaveFileReader(const WaveFileReader& rhs, bool reset);
|
||||||
virtual int getFrameCount() override;
|
std::unique_ptr<AudioStream> clone(bool reset) override;
|
||||||
virtual int getChannelCount() override;
|
int getSampleRate() override ;
|
||||||
virtual bool getNextSample(float &sample) override;
|
int getSampleCount() override;
|
||||||
|
int getSampleIndex() override;
|
||||||
|
void seek(int sampleIndex) override;
|
||||||
|
float readSample() override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void openFile();
|
||||||
|
|
||||||
|
private:
|
||||||
|
boost::filesystem::path filePath;
|
||||||
boost::filesystem::ifstream file;
|
boost::filesystem::ifstream file;
|
||||||
|
int bytesPerSample;
|
||||||
SampleFormat sampleFormat;
|
SampleFormat sampleFormat;
|
||||||
int frameRate;
|
int frameRate;
|
||||||
int frameCount;
|
int frameCount;
|
||||||
int channelCount;
|
int channelCount;
|
||||||
int remainingSamples;
|
int sampleCount;
|
||||||
|
size_t dataOffset;
|
||||||
|
int sampleIndex;
|
||||||
};
|
};
|
||||||
|
|
|
@ -13,9 +13,9 @@ void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileNa
|
||||||
// Write RIFF chunk
|
// Write RIFF chunk
|
||||||
write<uint32_t>(fourcc('R', 'I', 'F', 'F'), file);
|
write<uint32_t>(fourcc('R', 'I', 'F', 'F'), file);
|
||||||
uint32_t formatChunkSize = 16;
|
uint32_t formatChunkSize = 16;
|
||||||
uint16_t channelCount = static_cast<uint16_t>(inputStream->getChannelCount());
|
uint16_t channelCount = 1;
|
||||||
uint16_t frameSize = static_cast<uint16_t>(channelCount * sizeof(float));
|
uint16_t frameSize = static_cast<uint16_t>(channelCount * sizeof(float));
|
||||||
uint32_t dataChunkSize = static_cast<uint32_t>(inputStream->getFrameCount() * frameSize);
|
uint32_t dataChunkSize = static_cast<uint32_t>(inputStream->getSampleCount() * frameSize);
|
||||||
uint32_t riffChunkSize = 4 + (8 + formatChunkSize) + (8 + dataChunkSize);
|
uint32_t riffChunkSize = 4 + (8 + formatChunkSize) + (8 + dataChunkSize);
|
||||||
write<uint32_t>(riffChunkSize, file);
|
write<uint32_t>(riffChunkSize, file);
|
||||||
write<uint32_t>(fourcc('W', 'A', 'V', 'E'), file);
|
write<uint32_t>(fourcc('W', 'A', 'V', 'E'), file);
|
||||||
|
@ -26,7 +26,7 @@ void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileNa
|
||||||
uint16_t codec = 0x03; // 32-bit float
|
uint16_t codec = 0x03; // 32-bit float
|
||||||
write<uint16_t>(codec, file);
|
write<uint16_t>(codec, file);
|
||||||
write<uint16_t>(channelCount, file);
|
write<uint16_t>(channelCount, file);
|
||||||
uint32_t frameRate = static_cast<uint16_t>(inputStream->getFrameRate());
|
// The WAVE header stores the sample rate as a 32-bit field; casting to a 16-bit type would truncate rates above 65535 Hz (e.g. 96000).
uint32_t frameRate = static_cast<uint32_t>(inputStream->getSampleRate());
|
||||||
write<uint32_t>(frameRate, file);
|
write<uint32_t>(frameRate, file);
|
||||||
uint32_t bytesPerSecond = frameRate * frameSize;
|
uint32_t bytesPerSecond = frameRate * frameSize;
|
||||||
write<uint32_t>(bytesPerSecond, file);
|
write<uint32_t>(bytesPerSecond, file);
|
||||||
|
@ -37,8 +37,8 @@ void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileNa
|
||||||
// Write data chunk
|
// Write data chunk
|
||||||
write<uint32_t>(fourcc('d', 'a', 't', 'a'), file);
|
write<uint32_t>(fourcc('d', 'a', 't', 'a'), file);
|
||||||
write<uint32_t>(dataChunkSize, file);
|
write<uint32_t>(dataChunkSize, file);
|
||||||
float sample;
|
while (!inputStream->endOfStream()) {
|
||||||
while (inputStream->getNextSample(sample)) {
|
float sample = inputStream->readSample();
|
||||||
write<float>(sample, file);
|
write<float>(sample, file);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -107,7 +107,7 @@ int main(int argc, char *argv[]) {
|
||||||
{
|
{
|
||||||
ProgressBar progressBar;
|
ProgressBar progressBar;
|
||||||
phones = detectPhones(
|
phones = detectPhones(
|
||||||
[&inputFileName]() { return createAudioStream(inputFileName.getValue()); },
|
createAudioStream(inputFileName.getValue()),
|
||||||
dialog.getValue(),
|
dialog.getValue(),
|
||||||
progressBar);
|
progressBar);
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,6 @@
|
||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
#include "phoneExtraction.h"
|
#include "phoneExtraction.h"
|
||||||
#include "audioInput/SampleRateConverter.h"
|
#include "audioInput/SampleRateConverter.h"
|
||||||
#include "audioInput/ChannelDownmixer.h"
|
|
||||||
#include "platformTools.h"
|
#include "platformTools.h"
|
||||||
#include "tools.h"
|
#include "tools.h"
|
||||||
#include <format.h>
|
#include <format.h>
|
||||||
|
@ -33,17 +32,12 @@ using std::regex;
|
||||||
using std::regex_replace;
|
using std::regex_replace;
|
||||||
using std::chrono::duration;
|
using std::chrono::duration;
|
||||||
|
|
||||||
unique_ptr<AudioStream> to16kHzMono(unique_ptr<AudioStream> stream) {
|
unique_ptr<AudioStream> to16kHz(unique_ptr<AudioStream> stream) {
|
||||||
// Downmix, if required
|
|
||||||
if (stream->getChannelCount() != 1) {
|
|
||||||
stream.reset(new ChannelDownmixer(std::move(stream)));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Downsample, if required
|
// Downsample, if required
|
||||||
if (stream->getFrameRate() < 16000) {
|
if (stream->getSampleRate() < 16000) {
|
||||||
throw invalid_argument("Audio sample rate must not be below 16kHz.");
|
throw invalid_argument("Audio sample rate must not be below 16kHz.");
|
||||||
}
|
}
|
||||||
if (stream->getFrameRate() != 16000) {
|
if (stream->getSampleRate() != 16000) {
|
||||||
stream.reset(new SampleRateConverter(std::move(stream), 16000));
|
stream.reset(new SampleRateConverter(std::move(stream), 16000));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -88,7 +82,7 @@ int16_t floatSampleToInt16(float sample) {
|
||||||
return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
|
return static_cast<int16_t>(((sample + 1) / 2) * (INT16_MAX - INT16_MIN) + INT16_MIN);
|
||||||
}
|
}
|
||||||
|
|
||||||
void processAudioStream(AudioStream& audioStream16kHzMono, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
|
void processAudioStream(AudioStream& audioStream16kHz, function<void(const vector<int16_t>&)> processBuffer, ProgressSink& progressSink) {
|
||||||
// Process entire sound file
|
// Process entire sound file
|
||||||
vector<int16_t> buffer;
|
vector<int16_t> buffer;
|
||||||
const int capacity = 1600; // 0.1 second capacity
|
const int capacity = 1600; // 0.1 second capacity
|
||||||
|
@ -97,10 +91,9 @@ void processAudioStream(AudioStream& audioStream16kHzMono, function<void(const v
|
||||||
do {
|
do {
|
||||||
// Read to buffer
|
// Read to buffer
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
while (buffer.size() < capacity) {
|
while (buffer.size() < capacity && !audioStream16kHz.endOfStream()) {
|
||||||
// Read sample
|
// Read sample
|
||||||
float floatSample;
|
float floatSample = audioStream16kHz.readSample();
|
||||||
if (!audioStream16kHzMono.getNextSample(floatSample)) break;
|
|
||||||
int16_t sample = floatSampleToInt16(floatSample);
|
int16_t sample = floatSampleToInt16(floatSample);
|
||||||
buffer.push_back(sample);
|
buffer.push_back(sample);
|
||||||
}
|
}
|
||||||
|
@ -109,7 +102,7 @@ void processAudioStream(AudioStream& audioStream16kHzMono, function<void(const v
|
||||||
processBuffer(buffer);
|
processBuffer(buffer);
|
||||||
|
|
||||||
sampleCount += buffer.size();
|
sampleCount += buffer.size();
|
||||||
progressSink.reportProgress(static_cast<double>(sampleCount) / audioStream16kHzMono.getFrameCount());
|
progressSink.reportProgress(static_cast<double>(sampleCount) / audioStream16kHz.getSampleCount());
|
||||||
} while (buffer.size());
|
} while (buffer.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -158,7 +151,7 @@ void sphinxLogCallback(void* user_data, err_lvl_t errorLevel, const char* format
|
||||||
|
|
||||||
vector<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
|
vector<string> recognizeWords(unique_ptr<AudioStream> audioStream, ps_decoder_t& recognizer, ProgressSink& progressSink) {
|
||||||
// Convert audio stream to the exact format PocketSphinx requires
|
// Convert audio stream to the exact format PocketSphinx requires
|
||||||
audioStream = to16kHzMono(std::move(audioStream));
|
audioStream = to16kHz(std::move(audioStream));
|
||||||
|
|
||||||
// Start recognition
|
// Start recognition
|
||||||
int error = ps_start_utt(&recognizer);
|
int error = ps_start_utt(&recognizer);
|
||||||
|
@ -243,7 +236,7 @@ map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, uniqu
|
||||||
if (error) throw runtime_error("Error populating alignment struct.");
|
if (error) throw runtime_error("Error populating alignment struct.");
|
||||||
|
|
||||||
// Convert audio stream to the exact format PocketSphinx requires
|
// Convert audio stream to the exact format PocketSphinx requires
|
||||||
audioStream = to16kHzMono(std::move(audioStream));
|
audioStream = to16kHz(std::move(audioStream));
|
||||||
|
|
||||||
// Create search structure
|
// Create search structure
|
||||||
acmod_t* acousticModel = recognizer.acmod;
|
acmod_t* acousticModel = recognizer.acmod;
|
||||||
|
@ -304,7 +297,7 @@ map<centiseconds, Phone> getPhoneAlignment(const vector<s3wid_t>& wordIds, uniqu
|
||||||
}
|
}
|
||||||
|
|
||||||
map<centiseconds, Phone> detectPhones(
|
map<centiseconds, Phone> detectPhones(
|
||||||
std::function<std::unique_ptr<AudioStream>(void)> createAudioStream,
|
unique_ptr<AudioStream> audioStream,
|
||||||
boost::optional<std::string> dialog,
|
boost::optional<std::string> dialog,
|
||||||
ProgressSink& progressSink)
|
ProgressSink& progressSink)
|
||||||
{
|
{
|
||||||
|
@ -329,13 +322,13 @@ map<centiseconds, Phone> detectPhones(
|
||||||
// Get words
|
// Get words
|
||||||
vector<string> words = dialog
|
vector<string> words = dialog
|
||||||
? extractDialogWords(*dialog)
|
? extractDialogWords(*dialog)
|
||||||
: recognizeWords(createAudioStream(), *recognizer.get(), wordRecognitionProgressSink);
|
: recognizeWords(audioStream->clone(true), *recognizer.get(), wordRecognitionProgressSink);
|
||||||
|
|
||||||
// Look up words in dictionary
|
// Look up words in dictionary
|
||||||
vector<s3wid_t> wordIds = getWordIds(words, *recognizer->dict);
|
vector<s3wid_t> wordIds = getWordIds(words, *recognizer->dict);
|
||||||
|
|
||||||
// Align the word's phones with speech
|
// Align the word's phones with speech
|
||||||
map<centiseconds, Phone> result = getPhoneAlignment(wordIds, createAudioStream(), *recognizer.get(), alignmentProgressSink);
|
map<centiseconds, Phone> result = getPhoneAlignment(wordIds, std::move(audioStream), *recognizer.get(), alignmentProgressSink);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
catch (...) {
|
catch (...) {
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <functional>
|
|
||||||
#include "audioInput/AudioStream.h"
|
#include "audioInput/AudioStream.h"
|
||||||
#include "Phone.h"
|
#include "Phone.h"
|
||||||
#include "centiseconds.h"
|
#include "centiseconds.h"
|
||||||
|
@ -10,6 +9,6 @@
|
||||||
#include <boost/optional/optional.hpp>
|
#include <boost/optional/optional.hpp>
|
||||||
|
|
||||||
std::map<centiseconds, Phone> detectPhones(
|
std::map<centiseconds, Phone> detectPhones(
|
||||||
std::function<std::unique_ptr<AudioStream>(void)> createAudioStream,
|
std::unique_ptr<AudioStream> audioStream,
|
||||||
boost::optional<std::string> dialog,
|
boost::optional<std::string> dialog,
|
||||||
ProgressSink& progressSink);
|
ProgressSink& progressSink);
|
||||||
|
|
Loading…
Reference in New Issue