Implemented WAVE reading, writing, and conversion

This commit is contained in:
Daniel Wolf 2015-09-17 21:47:58 +02:00
parent 31d3867708
commit 641f64022d
16 changed files with 529 additions and 4 deletions

View File

@ -3,6 +3,10 @@
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/CMakeLists.txt" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/cppformat/format.cc" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/cppformat/format.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/cppformat/posix.cc" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/cppformat/posix.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/pocketsphinx/src/libpocketsphinx/acmod.c" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/pocketsphinx/src/libpocketsphinx/acmod.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/pocketsphinx/src/libpocketsphinx/allphone_search.c" isTestSource="false" />
@ -122,8 +126,19 @@
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase/src/libsphinxbase/util/slamch.c" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase/src/libsphinxbase/util/slapack_lite.c" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/lib/sphinxbase/src/libsphinxbase/util/strfuncs.c" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/main.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/tmp.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/16kHzMonoStream.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/AudioStream.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/ChannelDownmixer.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/IOTools.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/SampleRateConverter.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileReader.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.cpp" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/audio_input/WaveFileWriter.h" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main.cpp" isTestSource="false" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
@ -135,7 +150,9 @@
<root url="file://C:/MinGW/lib/gcc/mingw32/4.8.1/include-fixed" />
<root url="file://C:/MinGW/lib/gcc/mingw32/4.8.1/include/c++" />
<root url="file://C:/MinGW/lib/gcc/mingw32/4.8.1/include/ssp" />
<root url="file://$MODULE_DIR$/lib/cppformat" />
<root url="file://$MODULE_DIR$/lib/pocketsphinx/include" />
<root url="file://$MODULE_DIR$/lib/r8brain-free-src-1.6" />
<root url="file://$MODULE_DIR$/lib/sphinxbase/include" />
</CLASSES>
<SOURCES>
@ -144,9 +161,20 @@
<root url="file://C:/MinGW/lib/gcc/mingw32/4.8.1/include-fixed" />
<root url="file://C:/MinGW/lib/gcc/mingw32/4.8.1/include/c++" />
<root url="file://C:/MinGW/lib/gcc/mingw32/4.8.1/include/ssp" />
<root url="file://$MODULE_DIR$/lib/cppformat" />
<root url="file://$MODULE_DIR$/lib/pocketsphinx/include" />
<root url="file://$MODULE_DIR$/lib/r8brain-free-src-1.6" />
<root url="file://$MODULE_DIR$/lib/sphinxbase/include" />
</SOURCES>
<excluded>
<root url="file://$MODULE_DIR$/lib/cppformat/posix.cc" />
<root url="file://$MODULE_DIR$/lib/cppformat/posix.h" />
<root url="file://$MODULE_DIR$/lib/cppformat/format.h" />
<root url="file://$MODULE_DIR$/lib/cppformat/format.cc" />
<root url="file://$MODULE_DIR$/lib/r8brain-free-src-1.6/r8bbase.h" />
<root url="file://$MODULE_DIR$/lib/r8brain-free-src-1.6/example.cpp" />
<root url="file://$MODULE_DIR$/lib/r8brain-free-src-1.6/r8bbase.cpp" />
</excluded>
</library>
</orderEntry>
</component>

View File

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="" />
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@ -3,7 +3,7 @@ project(LipSync)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(SOURCE_FILES main.cpp)
set(SOURCE_FILES src/main.cpp src/audio_input/WaveFileReader.cpp src/audio_input/WaveFileReader.h src/audio_input/ChannelDownmixer.cpp src/audio_input/ChannelDownmixer.h src/audio_input/AudioStream.h src/audio_input/SampleRateConverter.cpp src/audio_input/SampleRateConverter.h src/audio_input/16kHzMonoStream.cpp src/audio_input/16kHzMonoStream.h src/audio_input/WaveFileWriter.cpp src/audio_input/WaveFileWriter.h src/audio_input/IOTools.h)
include_directories("lib/sphinxbase-5prealpha-2015-08-05/include" "lib/pocketsphinx-5prealpha-2015-08-05/include" "lib/cppformat")
FILE(GLOB_RECURSE SPHINX_BASE "lib/sphinxbase-5prealpha-2015-08-05/src/libsphinxbase/*.c")

View File

@ -0,0 +1,26 @@
#include "16kHzMonoStream.h"
#include "WaveFileReader.h"
#include "ChannelDownmixer.h"
#include "SampleRateConverter.h"
using std::runtime_error;
// Opens the WAVE file `fileName` and wraps it in whatever adapters are needed
// to deliver mono audio at a frame rate of 16 kHz.
// Throws runtime_error if the file's frame rate is below 16 kHz.
std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName) {
    const int targetFrameRate = 16000;

    // Start with the raw file contents
    std::unique_ptr<AudioStream> result(new WaveFileReader(fileName));

    // Anything other than a single channel gets mixed down to mono
    const bool isMono = result->getChannelCount() == 1;
    if (!isMono) {
        std::unique_ptr<AudioStream> downmixed(new ChannelDownmixer(std::move(result)));
        result = std::move(downmixed);
    }

    // Only downsampling is possible; lower rates are rejected
    const int frameRate = result->getFrameRate();
    if (frameRate < targetFrameRate) {
        throw runtime_error("Sample rate must not be below 16kHz.");
    }
    if (frameRate != targetFrameRate) {
        std::unique_ptr<AudioStream> resampled(new SampleRateConverter(std::move(result), targetFrameRate));
        result = std::move(resampled);
    }

    return result;
}

View File

@ -0,0 +1,10 @@
#ifndef LIPSYNC_WAVEFILEREADER16KHZMONO_H
#define LIPSYNC_WAVEFILEREADER16KHZMONO_H
#include "AudioStream.h"
#include <memory>
#include <string>
// Opens the WAVE file `fileName` and returns an audio stream for its contents,
// downmixed to one channel and converted to a frame rate of 16 kHz where necessary.
// Throws std::runtime_error if the file's frame rate is below 16 kHz.
std::unique_ptr<AudioStream> create16kHzMonoStream(std::string fileName);
#endif //LIPSYNC_WAVEFILEREADER16KHZMONO_H

View File

@ -0,0 +1,12 @@
#ifndef LIPSYNC_AUDIOSTREAM_H
#define LIPSYNC_AUDIOSTREAM_H
// Abstract interface for a sequential source of audio samples.
// Samples are delivered one at a time; for multi-channel streams, the samples
// of one frame (one per channel) follow each other.
class AudioStream {
public:
    // Streams are owned and destroyed through base-class pointers
    // (std::unique_ptr<AudioStream>), so the destructor must be virtual.
    // Otherwise deleting a derived stream is undefined behavior.
    virtual ~AudioStream() = default;

    // Returns the number of frames per second.
    virtual int getFrameRate() = 0;

    // Returns the total number of frames in the stream.
    virtual int getFrameCount() = 0;

    // Returns the number of channels (samples per frame).
    virtual int getChannelCount() = 0;

    // Stores the next sample in `sample` and returns true.
    // Returns false if there are no more samples.
    virtual bool getNextSample(float &sample) = 0;
};
#endif //LIPSYNC_AUDIOSTREAM_H

View File

@ -0,0 +1,31 @@
#include "ChannelDownmixer.h"
// Takes ownership of `inputStream` and remembers how many channels it has.
ChannelDownmixer::ChannelDownmixer(std::unique_ptr<AudioStream> inputStream) :
inputStream(std::move(inputStream)),
// The parameter has been moved from at this point, so the channel count is
// queried through the member (this->inputStream), which is initialized first.
inputChannelCount(this->inputStream->getChannelCount())
{}
// Returns the frame rate of the input stream.
int ChannelDownmixer::getFrameRate() {
return inputStream->getFrameRate();
}
// Returns the frame count of the input stream.
int ChannelDownmixer::getFrameCount() {
return inputStream->getFrameCount();
}
// The downmixer always reports a single (mono) channel.
int ChannelDownmixer::getChannelCount() {
return 1;
}
// Produces one mono sample by reading one sample per input channel and
// averaging them. Returns false (leaving `sample` untouched) as soon as the
// input stream fails to deliver a sample.
bool ChannelDownmixer::getNextSample(float &sample) {
    float total = 0;
    int channelsLeft = inputChannelCount;
    while (channelsLeft > 0) {
        float channelSample;
        const bool gotSample = inputStream->getNextSample(channelSample);
        if (!gotSample) {
            return false;
        }
        total += channelSample;
        --channelsLeft;
    }

    sample = total / inputChannelCount;
    return true;
}

View File

@ -0,0 +1,21 @@
#ifndef LIPSYNC_CHANNELDOWNMIXER_H
#define LIPSYNC_CHANNELDOWNMIXER_H
#include "AudioStream.h"
#include <memory>
// Converts a multi-channel audio stream to mono.
class ChannelDownmixer : public AudioStream {
public:
ChannelDownmixer(std::unique_ptr<AudioStream> inputStream);
virtual int getFrameRate() override;
virtual int getFrameCount() override;
virtual int getChannelCount() override;
virtual bool getNextSample(float &sample) override;
private:
std::unique_ptr<AudioStream> inputStream;
int inputChannelCount;
};
#endif //LIPSYNC_CHANNELDOWNMIXER_H

44
src/audio_input/IOTools.h Normal file
View File

@ -0,0 +1,44 @@
#ifndef LIPSYNC_IOTOOLS_H
#define LIPSYNC_IOTOOLS_H
#include <cstdint>
#include <fstream>
#include <string>
// Helpers for reading and writing values in little-endian byte order,
// independent of the byte order of the machine the program runs on.
namespace little_endian {

    // Returns true if this machine stores the least significant byte of an integer first.
    inline bool isLittleEndianHost() {
        const uint16_t probe = 1;
        return *reinterpret_cast<const unsigned char*>(&probe) == 1;
    }

    // Reads `bitsToRead` bits (a whole number of bytes, least significant byte
    // first) from `stream` and returns them as a value of type `Type`.
    // If fewer bits are read than `Type` holds, the remaining high-order bytes are zero.
    // End-of-file is not checked here; the bytes come straight from stream.get(),
    // so error reporting depends on the exception mask of `stream`.
    template <typename Type, int bitsToRead = 8 * sizeof(Type)>
    Type read(std::istream &stream) {
        static_assert(bitsToRead % 8 == 0, "Cannot read fractional bytes.");
        static_assert(bitsToRead <= sizeof(Type) * 8, "Bits to read exceed target type size.");

        Type result = 0;
        char *bytes = reinterpret_cast<char*>(&result);
        const int bytesToRead = bitsToRead / 8;
        const int lastByte = static_cast<int>(sizeof(Type)) - 1;
        const bool littleEndianHost = isLittleEndianHost();
        for (int byteIndex = 0; byteIndex < bytesToRead; byteIndex++) {
            // On a big-endian host, the least significant byte lives at the end of the object.
            const int offset = littleEndianHost ? byteIndex : lastByte - byteIndex;
            bytes[offset] = static_cast<char>(stream.get());
        }
        return result;
    }

    // Writes the lowest `bitsToWrite` bits (a whole number of bytes) of `value`
    // to `stream`, least significant byte first.
    template <typename Type, int bitsToWrite = 8 * sizeof(Type)>
    void write(Type value, std::ostream &stream) {
        static_assert(bitsToWrite % 8 == 0, "Cannot write fractional bytes.");
        static_assert(bitsToWrite <= sizeof(Type) * 8, "Bits to write exceed target type size.");

        const char *bytes = reinterpret_cast<const char*>(&value);
        const int bytesToWrite = bitsToWrite / 8;
        const int lastByte = static_cast<int>(sizeof(Type)) - 1;
        const bool littleEndianHost = isLittleEndianHost();
        for (int byteIndex = 0; byteIndex < bytesToWrite; byteIndex++) {
            const int offset = littleEndianHost ? byteIndex : lastByte - byteIndex;
            stream.put(bytes[offset]);
        }
    }

    // Packs four characters into the 32-bit value obtained by reading them as a
    // little-endian integer (c0 is the least significant byte).
    // The operands are widened to uint32_t first so that shifting c3 cannot
    // overflow a signed int.
    constexpr uint32_t fourcc(unsigned char c0, unsigned char c1, unsigned char c2, unsigned char c3) {
        return static_cast<uint32_t>(c0)
            | (static_cast<uint32_t>(c1) << 8)
            | (static_cast<uint32_t>(c2) << 16)
            | (static_cast<uint32_t>(c3) << 24);
    }

    // Inverse of fourcc(): returns the four characters packed into `fourcc`,
    // least significant byte first.
    inline std::string fourccToString(uint32_t fourcc) {
        std::string result(4, '\0');
        for (int charIndex = 0; charIndex < 4; charIndex++) {
            result[charIndex] = static_cast<char>((fourcc >> (8 * charIndex)) & 0xFF);
        }
        return result;
    }

}
#endif //LIPSYNC_IOTOOLS_H

View File

@ -0,0 +1,80 @@
#include <cmath>
#include "SampleRateConverter.h"
using std::runtime_error;
// Takes ownership of `inputStream` and prepares to convert it to `outputFrameRate`.
// Throws runtime_error if the input stream is not mono or if its frame rate is
// lower than `outputFrameRate`. Note that these checks run only after all
// members have been initialized.
SampleRateConverter::SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate) :
inputStream(std::move(inputStream)),
// Number of input frames per output frame.
// The parameter has been moved from, so the member is used from here on.
downscalingFactor(static_cast<double>(this->inputStream->getFrameRate()) / outputFrameRate),
outputFrameRate(outputFrameRate),
// Input frame count scaled down by the factor, rounded to the nearest integer
outputFrameCount(std::lround(this->inputStream->getFrameCount() / downscalingFactor)),
lastInputSample(0),
// -1: no input sample has been read yet, so index 0 counts as "the next one"
lastInputSampleIndex(-1),
nextOutputSampleIndex(0)
{
if (this->inputStream->getChannelCount() != 1) {
throw runtime_error("Only mono input streams are supported.");
}
if (this->inputStream->getFrameRate() < outputFrameRate) {
throw runtime_error("Upsampling not supported.");
}
}
// Returns the output frame rate that was passed to the constructor.
int SampleRateConverter::getFrameRate() {
return outputFrameRate;
}
// Returns the number of output frames, as computed in the constructor.
int SampleRateConverter::getFrameCount() {
return outputFrameCount;
}
// Always 1: the constructor rejects input streams that are not mono.
int SampleRateConverter::getChannelCount() {
return 1;
}
// Produces the next output sample as the mean of the input samples that fall
// into its time window. Returns false once all output frames have been delivered.
bool SampleRateConverter::getNextSample(float &sample) {
    const bool exhausted = nextOutputSampleIndex >= outputFrameCount;
    if (exhausted) {
        return false;
    }

    // Window of this output sample, measured in (fractional) input sample indices
    const double windowStart = nextOutputSampleIndex * downscalingFactor;
    const double windowEnd = (nextOutputSampleIndex + 1) * downscalingFactor;

    sample = mean(windowStart, windowEnd);
    ++nextOutputSampleIndex;
    return true;
}
// Returns the weighted mean of the input samples in the range start..end,
// where both bounds are fractional input sample indices. Samples that lie only
// partly inside the range contribute in proportion to the covered part.
float SampleRateConverter::mean(double start, double end) {
    const int firstIndex = static_cast<int>(start);
    const int lastIndex = static_cast<int>(end);

    double weightedSum = 0;

    // First sample: only the part from `start` up to the next index counts (weight <= 1)
    const double firstWeight = (firstIndex + 1) - start;
    weightedSum += getInputSample(firstIndex) * firstWeight;

    // Samples that lie completely inside the range (weight 1 each)
    int index = firstIndex + 1;
    while (index < lastIndex) {
        weightedSum += getInputSample(index);
        ++index;
    }

    // Last sample: only the part up to `end` counts (weight < 1)
    const double lastWeight = end - lastIndex;
    weightedSum += getInputSample(lastIndex) * lastWeight;

    const double totalWeight = end - start;
    return static_cast<float>(weightedSum / totalWeight);
}
// Returns the input sample with the given index. Because the input is a
// forward-only stream, only the most recently read sample or its direct
// successor can be requested; anything else throws runtime_error.
float SampleRateConverter::getInputSample(int sampleIndex) {
    const bool isCurrent = sampleIndex == lastInputSampleIndex;
    const bool isNext = sampleIndex == lastInputSampleIndex + 1;
    if (!isCurrent && !isNext) {
        throw runtime_error("Can only return the last sample or the one following it.");
    }

    if (isNext) {
        lastInputSampleIndex = sampleIndex;
        // Advance the input stream. The return value is deliberately ignored:
        // if the stream has run dry (at the very end), the previous sample
        // stays in place and is simply reused.
        inputStream->getNextSample(lastInputSample);
    }
    return lastInputSample;
}

View File

@ -0,0 +1,34 @@
#ifndef LIPSYNC_SAMPLERATECONVERTER_H
#define LIPSYNC_SAMPLERATECONVERTER_H
#include <memory>
#include <vector>
#include "AudioStream.h"
class SampleRateConverter : public AudioStream {
public:
SampleRateConverter(std::unique_ptr<AudioStream> inputStream, int outputFrameRate);
virtual int getFrameRate() override;
virtual int getFrameCount() override;
virtual int getChannelCount() override;
virtual bool getNextSample(float &sample) override;
private:
// The stream we're reading from
std::unique_ptr<AudioStream> inputStream;
// input frame rate / output frame rate
double downscalingFactor;
int outputFrameRate;
int outputFrameCount;
float lastInputSample;
int lastInputSampleIndex;
int nextOutputSampleIndex;
float mean(double start, double end);
float getInputSample(int sampleIndex);
};
#endif //LIPSYNC_SAMPLERATECONVERTER_H

View File

@ -0,0 +1,152 @@
#include <format.h>
#include "WaveFileReader.h"
#include "IOTools.h"
using std::runtime_error;
using fmt::format;
using std::string;
using namespace little_endian;
// Smallest and largest value of a signed 24-bit integer (-2^23 and 2^23 - 1).
// Used as the value range when normalizing 24-bit samples.
#define INT24_MIN (-8388608)
#define INT24_MAX 8388607
// Converts an int in the range min..max to a float in the range -1..1:
// min maps to -1, max maps to 1, values in between are interpolated linearly.
// The differences are computed in 64-bit arithmetic so that wide ranges (up to
// the full range of int) cannot cause signed integer overflow.
// Requires min < max.
float toNormalizedFloat(int value, int min, int max) {
    const long long offset = static_cast<long long>(value) - min;
    const long long range = static_cast<long long>(max) - min;
    return (static_cast<float>(offset) / static_cast<float>(range) * 2) - 1;
}
// Rounds i up to the nearest even number. Even values are returned unchanged.
int roundToEven(int i) {
    // The lowest bit is 1 exactly for odd values, which need to be bumped by one.
    const int lowestBit = i & 1;
    return i + lowestBit;
}
// Codec identifiers as stored in the first field of a WAVE file's 'fmt ' chunk.
// Only the codecs listed here are supported by the reader.
enum class Codec {
// Integer samples
PCM = 0x01,
// Floating-point samples
Float = 0x03
};
// Opens the WAVE file `fileName` and parses its header up to the start of the
// sample data, so that samples can be read with getNextSample() afterwards.
// Throws runtime_error if the file is not a WAVE file, is malformed, or uses an
// unsupported sample format. I/O errors, including a premature end of file,
// are reported through the stream's exceptions (std::ios_base::failure).
WaveFileReader::WaveFileReader(std::string fileName) {
    // Open file
    file.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    file.open(fileName, std::ios::binary);

    // Read header
    uint32_t rootChunkId = read<uint32_t>(file);
    if (rootChunkId != fourcc('R', 'I', 'F', 'F')) {
        throw runtime_error("Unknown file format. Only WAVE files are supported.");
    }
    read<uint32_t>(file); // Chunk size
    uint32_t waveId = read<uint32_t>(file);
    if (waveId != fourcc('W', 'A', 'V', 'E')) {
        throw runtime_error(format("File format is not WAVE, but {}.", fourccToString(waveId)));
    }

    // Read chunks until we reach the data chunk
    bool reachedDataChunk = false;
    int bytesPerSample = 0; // Remains 0 until a format chunk has been parsed
    do {
        uint32_t chunkId = read<uint32_t>(file);
        // Keep the size unsigned: a large size must never turn into a backward seek.
        uint32_t chunkSize = read<uint32_t>(file);
        // Chunks are padded to an even number of bytes.
        const std::streamoff paddedChunkSize = static_cast<std::streamoff>(chunkSize) + (chunkSize & 1);

        switch (chunkId) {
            case fourcc('f', 'm', 't', ' '): {
                // The fields read below occupy 16 bytes
                const std::streamoff formatFieldsSize = 16;
                if (paddedChunkSize < formatFieldsSize) {
                    throw runtime_error("Invalid WAVE file: format chunk is too small.");
                }

                // Read relevant data
                Codec codec = static_cast<Codec>(read<uint16_t>(file));
                channelCount = read<uint16_t>(file);
                frameRate = read<uint32_t>(file);
                read<uint32_t>(file); // Bytes per second
                int frameSize = read<uint16_t>(file);
                int bitsPerSample = read<uint16_t>(file);

                // The channel count is used as a divisor below and for the data chunk.
                if (channelCount == 0) {
                    throw runtime_error("Invalid WAVE file: channel count must not be zero.");
                }

                // We've read 16 bytes so far. Skip the remainder.
                file.seekg(paddedChunkSize - formatFieldsSize, file.cur);

                // Determine sample format
                switch (codec) {
                    case Codec::PCM:
                        // Determine sample size.
                        // According to the WAVE standard, sample sizes that are not multiples of 8 bits
                        // (e.g. 12 bits) can be treated like the next-larger byte size.
                        if (bitsPerSample == 8) {
                            sampleFormat = SampleFormat::UInt8;
                            bytesPerSample = 1;
                        } else if (bitsPerSample <= 16) {
                            sampleFormat = SampleFormat::Int16;
                            bytesPerSample = 2;
                        } else if (bitsPerSample <= 24) {
                            sampleFormat = SampleFormat::Int24;
                            bytesPerSample = 3;
                        } else {
                            throw runtime_error(
                                format("Unsupported sample format: {}-bit integer samples.", bitsPerSample));
                        }
                        if (bytesPerSample != frameSize / channelCount) {
                            throw runtime_error("Unsupported sample organization.");
                        }
                        break;
                    case Codec::Float:
                        if (bitsPerSample == 32) {
                            sampleFormat = SampleFormat::Float32;
                            bytesPerSample = 4;
                        } else {
                            throw runtime_error(format("Unsupported sample format: {}-bit floating-point samples.", bitsPerSample));
                        }
                        break;
                    default:
                        throw runtime_error("Unsupported sample format. Only uncompressed formats are supported.");
                }
                break;
            }
            case fourcc('d', 'a', 't', 'a'): {
                // Without a preceding format chunk, the sample size and channel
                // count are unknown (and would be zero divisors below).
                if (bytesPerSample == 0) {
                    throw runtime_error("Invalid WAVE file: data chunk found before format chunk.");
                }
                reachedDataChunk = true;
                // NOTE: the sample counters are ints; data chunks holding more
                // than INT_MAX samples are not handled.
                remainingSamples = static_cast<int>(chunkSize / bytesPerSample);
                frameCount = remainingSamples / channelCount;
                break;
            }
            default: {
                // Skip unknown chunk
                file.seekg(paddedChunkSize, file.cur);
                break;
            }
        }
    } while (!reachedDataChunk);
}
// Returns the frame rate read from the file's 'fmt ' chunk.
int WaveFileReader::getFrameRate() {
return frameRate;
}
// Returns the number of frames, as derived from the size of the 'data' chunk.
int WaveFileReader::getFrameCount() {
return frameCount;
}
// Returns the channel count read from the file's 'fmt ' chunk.
int WaveFileReader::getChannelCount() {
return channelCount;
}
// Reads the next sample from the file, converts it from the file's sample
// format to a float and stores it in `sample`.
// Returns false if all samples of the data chunk have been read.
bool WaveFileReader::getNextSample(float &sample) {
    const bool exhausted = remainingSamples == 0;
    if (exhausted) {
        return false;
    }
    --remainingSamples;

    if (sampleFormat == SampleFormat::UInt8) {
        const uint8_t byteValue = read<uint8_t>(file);
        sample = toNormalizedFloat(byteValue, 0, UINT8_MAX);
    } else if (sampleFormat == SampleFormat::Int16) {
        const int16_t shortValue = read<int16_t>(file);
        sample = toNormalizedFloat(shortValue, INT16_MIN, INT16_MAX);
    } else if (sampleFormat == SampleFormat::Int24) {
        // Only the lower three bytes come from the file. If the 24-bit sign
        // bit is set, the upper byte must be filled with ones (two's complement).
        int value = read<int, 24>(file);
        const bool isNegative = (value & 0x800000) != 0;
        if (isNegative) {
            value |= 0xFF000000;
        }
        sample = toNormalizedFloat(value, INT24_MIN, INT24_MAX);
    } else if (sampleFormat == SampleFormat::Float32) {
        // Floating-point samples are passed through unchanged.
        sample = read<float>(file);
    }

    return true;
}

View File

@ -0,0 +1,33 @@
#ifndef LIPSYNC_WAVFILEREADER_H
#define LIPSYNC_WAVFILEREADER_H
#include <string>
#include <cstdint>
#include <fstream>
#include "AudioStream.h"
// Storage formats of a single sample that WaveFileReader can decode.
enum class SampleFormat {
// Unsigned 8-bit integer
UInt8,
// Signed 16-bit integer
Int16,
// Signed 24-bit integer, stored in three bytes
Int24,
// 32-bit floating-point number
Float32
};
// Audio stream that reads its samples from a WAVE file.
class WaveFileReader : public AudioStream {
public:
    // Opens the file and parses its header up to the start of the sample data.
    WaveFileReader(std::string fileName);

    virtual int getFrameRate() override ;
    virtual int getFrameCount() override;
    virtual int getChannelCount() override;
    virtual bool getNextSample(float &sample) override;

private:
    std::ifstream file;

    // The members below are filled in while the constructor parses the file.
    // They get defined defaults so that no path can read an indeterminate value.
    SampleFormat sampleFormat = SampleFormat::UInt8;
    int frameRate = 0;
    int frameCount = 0;
    int channelCount = 0;

    // Number of samples (not frames) of the data chunk that have not been read yet
    int remainingSamples = 0;
};
#endif //LIPSYNC_WAVFILEREADER_H

View File

@ -0,0 +1,44 @@
#include <fstream>
#include "WaveFileWriter.h"
#include "IOTools.h"
using namespace little_endian;
// Writes all samples of `inputStream` to a new WAVE file named `fileName`,
// using 32-bit floating-point samples.
// The chunk sizes in the header are derived from the stream's reported frame
// count before any sample is read.
// I/O errors are reported through the stream's exceptions (std::ios_base::failure).
void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileName) {
    // Open file
    std::ofstream file;
    file.exceptions(std::ofstream::failbit | std::ofstream::badbit);
    file.open(fileName, std::ios::out | std::ios::binary);

    // Write RIFF chunk
    write<uint32_t>(fourcc('R', 'I', 'F', 'F'), file);
    uint32_t formatChunkSize = 16;
    uint16_t channelCount = static_cast<uint16_t>(inputStream->getChannelCount());
    uint16_t frameSize = static_cast<uint16_t>(channelCount * sizeof(float));
    // Widen the frame count before multiplying so the product is not computed in int.
    uint32_t dataChunkSize = static_cast<uint32_t>(inputStream->getFrameCount()) * frameSize;
    uint32_t riffChunkSize = 4 + (8 + formatChunkSize) + (8 + dataChunkSize);
    write<uint32_t>(riffChunkSize, file);
    write<uint32_t>(fourcc('W', 'A', 'V', 'E'), file);

    // Write format chunk
    write<uint32_t>(fourcc('f', 'm', 't', ' '), file);
    write<uint32_t>(formatChunkSize, file);
    uint16_t codec = 0x03; // 32-bit float
    write<uint16_t>(codec, file);
    write<uint16_t>(channelCount, file);
    // The frame rate is a 32-bit field. It must not be narrowed to 16 bits,
    // which would corrupt rates above 65535 Hz (e.g. 96 kHz).
    uint32_t frameRate = static_cast<uint32_t>(inputStream->getFrameRate());
    write<uint32_t>(frameRate, file);
    uint32_t bytesPerSecond = frameRate * frameSize;
    write<uint32_t>(bytesPerSecond, file);
    write<uint16_t>(frameSize, file);
    uint16_t bitsPerSample = 8 * sizeof(float);
    write<uint16_t>(bitsPerSample, file);

    // Write data chunk
    write<uint32_t>(fourcc('d', 'a', 't', 'a'), file);
    write<uint32_t>(dataChunkSize, file);
    float sample;
    while (inputStream->getNextSample(sample)) {
        write<float>(sample, file);
    }
}

View File

@ -0,0 +1,10 @@
#ifndef LIPSYNC_WAVEFILEWRITER_H
#define LIPSYNC_WAVEFILEWRITER_H
#include <memory>
#include <string>
#include "AudioStream.h"
// Writes all samples of `inputStream` to a new WAVE file named `fileName`,
// using 32-bit floating-point samples. Takes ownership of the stream.
void createWaveFile(std::unique_ptr<AudioStream> inputStream, std::string fileName);
#endif //LIPSYNC_WAVEFILEWRITER_H