Improve WAVE file reader to handle more formats

Fixes #101
This commit is contained in:
Daniel Wolf 2021-09-21 20:42:33 +02:00
parent 3c0befa070
commit 77588fb40e
5 changed files with 276 additions and 24 deletions

View File

@ -2,6 +2,7 @@
## Unreleased
* **Added** support for more WAVE file features ([issue #101](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/101))
* **Changed** Rhubarb Lip Sync for Spine so that it works with any modern JRE ([issue #97](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/97))
* **Changed** Windows build from 32 bit to 64 bit ([issue #98](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/98))

View File

@ -520,6 +520,7 @@ set(TEST_FILES
tests/tokenizationTests.cpp
tests/g2pTests.cpp
tests/LazyTests.cpp
tests/WaveFileReaderTests.cpp
)
add_executable(runTests ${TEST_FILES})
target_link_libraries(runTests
@ -528,6 +529,7 @@ target_link_libraries(runTests
gmock_main
rhubarb-recognition
rhubarb-time
rhubarb-audio
)
# Copies the specified files in a post-build event, then installs them
@ -555,9 +557,30 @@ function(copy_and_install sourceGlob relativeTargetDirectory)
endforeach()
endfunction()
# Copies the specified files in a post-build event of the `rhubarb` target.
# Unlike `copy_and_install`, this function does not add any install rules.
#
# Arguments:
#   sourceGlob              - globbing expression selecting the files to copy.
#                             Directories matched by the expression are skipped.
#   relativeTargetDirectory - destination directory, relative to the directory
#                             that contains the built `rhubarb` binary.
function(copy sourceGlob relativeTargetDirectory)
	# Set `sourcePaths`
	file(GLOB sourcePaths "${sourceGlob}")

	foreach(sourcePath ${sourcePaths})
		# Quoted so that the path is always passed to if() as exactly one argument
		if(NOT IS_DIRECTORY "${sourcePath}")
			# Set `fileName`
			get_filename_component(fileName "${sourcePath}" NAME)

			# Copy file during build.
			# VERBATIM makes CMake escape the arguments correctly for the build tool
			# on every platform instead of relying on platform-specific behavior.
			add_custom_command(TARGET rhubarb POST_BUILD
				COMMAND "${CMAKE_COMMAND}" -E copy "${sourcePath}" "$<TARGET_FILE_DIR:rhubarb>/${relativeTargetDirectory}/${fileName}"
				COMMENT "Creating '${relativeTargetDirectory}/${fileName}'"
				VERBATIM
			)
		endif()
	endforeach()
endfunction()
copy_and_install("lib/pocketsphinx-rev13216/model/en-us/*" "res/sphinx")
copy_and_install("lib/cmusphinx-en-us-5.2/*" "res/sphinx/acoustic-model")
copy_and_install("tests/resources/*" "tests/resources")
install(
TARGETS rhubarb
RUNTIME

View File

@ -13,22 +13,27 @@ using std::unique_ptr;
using std::make_unique;
using std::make_shared;
using std::filesystem::path;
using std::streamoff;
#define INT24_MIN (-8388608)
#define INT24_MAX 8388607
// Converts an int in the range min..max to a float in the range -1..1.
//
// Every operand is converted to float *before* any arithmetic takes place. Computing
// `value - min` or `max - min` in int would overflow for wide ranges such as
// INT32_MIN..INT32_MAX, which is the range used for 32-bit integer samples.
//
// The caller must pass min < max; min == max results in a division by zero.
float toNormalizedFloat(int value, int min, int max) {
	const float fMin = static_cast<float>(min);
	const float fMax = static_cast<float>(max);
	const float fValue = static_cast<float>(value);
	return ((fValue - fMin) / (fMax - fMin) * 2) - 1;
}
// Rounds a non-negative stream offset up to the nearest even value.
// Even values are returned unchanged; odd values are incremented by one.
//
// Works on streamoff rather than int so that offsets beyond the int range are handled.
// The mask is built as a streamoff to make it explicit that all upper bits are kept.
streamoff roundUpToEven(streamoff i) {
	return (i + 1) & ~static_cast<streamoff>(1);
}
// Codec identifiers as stored in the `codec` field of a WAVE file's format chunk
namespace Codec {
	// Integer PCM samples
	constexpr int Pcm = 0x01;

	// IEEE floating-point samples
	constexpr int Float = 0x03;

	// Placeholder value: the codec actually in use is stored in the extension fields
	// of the format chunk and overrides this value
	constexpr int Extensible = 0xFFFE;
}
string codecToString(int codec);
@ -39,11 +44,11 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) {
auto file = openFile(filePath);
file.seekg(0, std::ios_base::end);
std::streamoff fileSize = file.tellg();
const streamoff fileSize = file.tellg();
file.seekg(0);
auto remaining = [&](int byteCount) {
const std::streamoff filePosition = file.tellg();
const streamoff filePosition = file.tellg();
return byteCount <= fileSize - filePosition;
};
@ -51,34 +56,46 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) {
if (!remaining(10)) {
throw runtime_error("WAVE file is corrupt. Header not found.");
}
auto rootChunkId = read<uint32_t>(file);
const auto rootChunkId = read<uint32_t>(file);
if (rootChunkId != fourcc('R', 'I', 'F', 'F')) {
throw runtime_error("Unknown file format. Only WAVE files are supported.");
}
read<uint32_t>(file); // Chunk size
uint32_t waveId = read<uint32_t>(file);
const uint32_t waveId = read<uint32_t>(file);
if (waveId != fourcc('W', 'A', 'V', 'E')) {
throw runtime_error(format("File format is not WAVE, but {}.", fourccToString(waveId)));
}
// Read chunks until we reach the data chunk
bool reachedDataChunk = false;
while (!reachedDataChunk && remaining(8)) {
uint32_t chunkId = read<uint32_t>(file);
int chunkSize = read<uint32_t>(file);
bool processedFormatChunk = false;
bool processedDataChunk = false;
while ((!processedFormatChunk || !processedDataChunk) && remaining(8)) {
const uint32_t chunkId = read<uint32_t>(file);
const streamoff chunkSize = read<int32_t>(file);
const streamoff chunkEnd = roundUpToEven(file.tellg() + chunkSize);
switch (chunkId) {
case fourcc('f', 'm', 't', ' '):
{
// Read relevant data
uint16_t codec = read<uint16_t>(file);
formatInfo.channelCount = read<uint16_t>(file);
formatInfo.frameRate = read<uint32_t>(file);
formatInfo.frameRate = read<int32_t>(file);
read<uint32_t>(file); // Bytes per second
int frameSize = read<uint16_t>(file);
int bitsPerSample = read<uint16_t>(file);
// We've read 16 bytes so far. Skip the remainder.
file.seekg(roundToEven(chunkSize) - 16, std::ios_base::cur);
const int bytesPerFrame = read<uint16_t>(file);
const int bitsPerSampleOnDisk = read<uint16_t>(file);
int bitsPerSample = bitsPerSampleOnDisk;
if (chunkSize > 16) {
const int extensionSize = read<uint16_t>(file);
if (extensionSize >= 22) {
// Read extension fields
bitsPerSample = read<uint16_t>(file);
read<uint32_t>(file); // Skip channel mask
const uint16_t codecOverride = read<uint16_t>(file);
if (codec == Codec::Extensible) {
codec = codecOverride;
}
}
}
// Determine sample format
int bytesPerSample;
@ -96,11 +113,14 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) {
} else if (bitsPerSample <= 24) {
formatInfo.sampleFormat = SampleFormat::Int24;
bytesPerSample = 3;
} else if (bitsPerSample <= 32) {
formatInfo.sampleFormat = SampleFormat::Int32;
bytesPerSample = 4;
} else {
throw runtime_error(
format("Unsupported sample format: {}-bit PCM.", bitsPerSample));
}
if (bytesPerSample != frameSize / formatInfo.channelCount) {
if (bytesPerSample != bytesPerFrame / formatInfo.channelCount) {
throw runtime_error("Unsupported sample organization.");
}
break;
@ -108,6 +128,9 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) {
if (bitsPerSample == 32) {
formatInfo.sampleFormat = SampleFormat::Float32;
bytesPerSample = 4;
} else if (bitsPerSample == 64) {
formatInfo.sampleFormat = SampleFormat::Float64;
bytesPerSample = 8;
} else {
throw runtime_error(
format("Unsupported sample format: {}-bit IEEE Float.", bitsPerSample)
@ -121,24 +144,30 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) {
));
}
formatInfo.bytesPerFrame = bytesPerSample * formatInfo.channelCount;
processedFormatChunk = true;
break;
}
case fourcc('d', 'a', 't', 'a'):
{
reachedDataChunk = true;
formatInfo.dataOffset = file.tellg();
formatInfo.frameCount = chunkSize / formatInfo.bytesPerFrame;
processedDataChunk = true;
break;
}
default:
{
// Skip unknown chunk
file.seekg(roundToEven(chunkSize), std::ios_base::cur);
// Ignore unknown chunk
break;
}
}
// Seek to end of chunk
file.seekg(chunkEnd, std::ios_base::beg);
}
if (!processedFormatChunk) throw runtime_error("Missing format chunk.");
if (!processedDataChunk) throw runtime_error("Missing data chunk.");
return formatInfo;
}
@ -177,11 +206,22 @@ inline AudioClip::value_type readSample(
sum += toNormalizedFloat(raw, INT24_MIN, INT24_MAX);
break;
}
case SampleFormat::Int32:
{
const int32_t raw = read<int32_t>(file);
sum += toNormalizedFloat(raw, INT32_MIN, INT32_MAX);
break;
}
case SampleFormat::Float32:
{
sum += read<float>(file);
break;
}
case SampleFormat::Float64:
{
sum += static_cast<float>(read<double>(file));
break;
}
}
}
@ -196,13 +236,13 @@ SampleReader WaveFileReader::createUnsafeSampleReader() const {
filePos = std::streampos(0)
](size_type index) mutable {
const std::streampos newFilePos = formatInfo.dataOffset
+ static_cast<std::streamoff>(index * formatInfo.bytesPerFrame);
+ static_cast<streamoff>(index * formatInfo.bytesPerFrame);
if (newFilePos != filePos) {
file->seekg(newFilePos);
}
const value_type result =
readSample(*file, formatInfo.sampleFormat, formatInfo.channelCount);
filePos = newFilePos + static_cast<std::streamoff>(formatInfo.bytesPerFrame);
filePos = newFilePos + static_cast<streamoff>(formatInfo.bytesPerFrame);
return result;
};
}
@ -454,4 +494,4 @@ string codecToString(int codec) {
default:
return format("{0:#x}", codec);
}
}
}

View File

@ -7,7 +7,9 @@ enum class SampleFormat {
UInt8,
Int16,
Int24,
Float32
Int32,
Float32,
Float64
};
struct WaveFormatInfo {

View File

@ -0,0 +1,186 @@
#include <gmock/gmock.h>
#include "audio/WaveFileReader.h"
#include "tools/platformTools.h"
using namespace testing;
TEST(getWaveFormatInfo, float32FromAudacity) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float32-audacity.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float32);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4);
EXPECT_EQ(formatInfo.dataOffset, 88);
}
TEST(getWaveFormatInfo, float32FromAudition) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float32-audition.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float32);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4);
EXPECT_EQ(formatInfo.dataOffset, 92);
}
TEST(getWaveFormatInfo, float32FromFfmpeg) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float32-ffmpeg.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float32);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4);
EXPECT_EQ(formatInfo.dataOffset, 114);
}
TEST(getWaveFormatInfo, float32FromSoundforge) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float32-soundforge.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float32);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4);
EXPECT_EQ(formatInfo.dataOffset, 44);
}
TEST(getWaveFormatInfo, float64FromFfmpeg) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float64-ffmpeg.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float64);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 8);
EXPECT_EQ(formatInfo.dataOffset, 114);
}
TEST(getWaveFormatInfo, int16FromAudacity) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int16-audacity.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int16);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 2);
EXPECT_EQ(formatInfo.dataOffset, 44);
}
TEST(getWaveFormatInfo, int16FromAudition) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int16-audition.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int16);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 2);
EXPECT_EQ(formatInfo.dataOffset, 92);
}
TEST(getWaveFormatInfo, int16FromFfmpeg) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int16-ffmpeg.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int16);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 2);
EXPECT_EQ(formatInfo.dataOffset, 78);
}
TEST(getWaveFormatInfo, int16FromSoundforge) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int16-soundforge.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int16);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 2);
EXPECT_EQ(formatInfo.dataOffset, 44);
}
TEST(getWaveFormatInfo, int24FromAudacity) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int24-audacity.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int24);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 3);
EXPECT_EQ(formatInfo.dataOffset, 44);
}
TEST(getWaveFormatInfo, int24FromAudition) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int24-audition.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int24);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 3);
EXPECT_EQ(formatInfo.dataOffset, 92);
}
TEST(getWaveFormatInfo, int24FromFfmpeg) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int24-ffmpeg.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int24);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 3);
EXPECT_EQ(formatInfo.dataOffset, 102);
}
TEST(getWaveFormatInfo, int24FromSoundforge) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int24-soundforge.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int24);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 3);
EXPECT_EQ(formatInfo.dataOffset, 44);
}
TEST(getWaveFormatInfo, int32FromFfmpeg) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int32-ffmpeg.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int32);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4);
EXPECT_EQ(formatInfo.dataOffset, 102);
}
TEST(getWaveFormatInfo, int32FromSoundforge) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int32-soundforge.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int32);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4);
EXPECT_EQ(formatInfo.dataOffset, 44);
}
TEST(getWaveFormatInfo, uint8FromAudition) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-uint8-audition.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::UInt8);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 1);
EXPECT_EQ(formatInfo.dataOffset, 92);
}
TEST(getWaveFormatInfo, uint8FromFfmpeg) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-uint8-ffmpeg.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::UInt8);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 1);
EXPECT_EQ(formatInfo.dataOffset, 78);
}
TEST(getWaveFormatInfo, uint8FromSoundforge) {
auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-uint8-soundforge.wav");
EXPECT_EQ(formatInfo.frameRate, 48000);
EXPECT_EQ(formatInfo.frameCount, 480000);
EXPECT_EQ(formatInfo.channelCount, 2);
EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::UInt8);
EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 1);
EXPECT_EQ(formatInfo.dataOffset, 44);
}