From 77588fb40e41bda85dc4763d2838fc2e148b0806 Mon Sep 17 00:00:00 2001 From: Daniel Wolf Date: Tue, 21 Sep 2021 20:42:33 +0200 Subject: [PATCH] Improve WAVE file reader to handle more formats Fixes #101 --- CHANGELOG.md | 1 + rhubarb/CMakeLists.txt | 23 ++++ rhubarb/src/audio/WaveFileReader.cpp | 86 ++++++++---- rhubarb/src/audio/WaveFileReader.h | 4 +- rhubarb/tests/WaveFileReaderTests.cpp | 186 ++++++++++++++++++++++++++ 5 files changed, 276 insertions(+), 24 deletions(-) create mode 100644 rhubarb/tests/WaveFileReaderTests.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index d13260d..ffd6281 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## Unreleased +* **Added** support for more WAVE file features ([issue #101](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/101)) * **Changed** Rhubarb Lip Sync for Spine so that it works with any modern JRE ([issue #97](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/97)) * **Changed** Windows build from 32 bit to 64 bit ([issue #98](https://github.com/DanielSWolf/rhubarb-lip-sync/issues/98)) diff --git a/rhubarb/CMakeLists.txt b/rhubarb/CMakeLists.txt index 0e20ed3..596b303 100644 --- a/rhubarb/CMakeLists.txt +++ b/rhubarb/CMakeLists.txt @@ -520,6 +520,7 @@ set(TEST_FILES tests/tokenizationTests.cpp tests/g2pTests.cpp tests/LazyTests.cpp + tests/WaveFileReaderTests.cpp ) add_executable(runTests ${TEST_FILES}) target_link_libraries(runTests @@ -528,6 +529,7 @@ target_link_libraries(runTests gmock_main rhubarb-recognition rhubarb-time + rhubarb-audio ) # Copies the specified files in a post-build event, then installs them @@ -555,9 +557,30 @@ function(copy_and_install sourceGlob relativeTargetDirectory) endforeach() endfunction() +# Copies the specified files in a post-build event +function(copy sourceGlob relativeTargetDirectory) + # Set `sourcePaths` + file(GLOB sourcePaths "${sourceGlob}") + + foreach(sourcePath ${sourcePaths}) + if(NOT IS_DIRECTORY ${sourcePath}) + # Set `fileName` + get_filename_component(fileName "${sourcePath}" NAME) + + # Copy file during build + add_custom_command(TARGET rhubarb POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${sourcePath}" "$/${relativeTargetDirectory}/${fileName}" + COMMENT "Creating '${relativeTargetDirectory}/${fileName}'" + ) + endif() + endforeach() +endfunction() + copy_and_install("lib/pocketsphinx-rev13216/model/en-us/*" "res/sphinx") copy_and_install("lib/cmusphinx-en-us-5.2/*" "res/sphinx/acoustic-model") +copy_and_install("tests/resources/*" "tests/resources") + install( TARGETS rhubarb RUNTIME diff --git a/rhubarb/src/audio/WaveFileReader.cpp b/rhubarb/src/audio/WaveFileReader.cpp index 72d4b16..3cb04c5 100644 --- a/rhubarb/src/audio/WaveFileReader.cpp +++ b/rhubarb/src/audio/WaveFileReader.cpp @@ -13,22 +13,27 @@ using std::unique_ptr; using std::make_unique; using std::make_shared; using std::filesystem::path; +using std::streamoff; #define INT24_MIN (-8388608) #define INT24_MAX 8388607 // Converts an int in the range min..max to a float in the range -1..1 float toNormalizedFloat(int value, int min, int max) { - return (static_cast(value - min) / (max - min) * 2) - 1; + const float fMin = static_cast(min); + const float fMax = static_cast(max); + const float fValue = static_cast(value); + return ((fValue - fMin) / (fMax - fMin) * 2) - 1; } -int roundToEven(int i) { +streamoff roundUpToEven(streamoff i) { return (i + 1) & (~1); } namespace Codec { constexpr int Pcm = 0x01; constexpr int Float = 0x03; + constexpr int Extensible = 0xFFFE; }; string codecToString(int codec); @@ -39,11 +44,11 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) { auto file = openFile(filePath); file.seekg(0, std::ios_base::end); - std::streamoff fileSize = file.tellg(); + const streamoff fileSize = file.tellg(); file.seekg(0); auto remaining = [&](int byteCount) { - const std::streamoff filePosition = file.tellg(); + const streamoff filePosition = file.tellg(); return byteCount <= fileSize - filePosition; }; @@ -51,34 +56,46 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) { if (!remaining(10)) { throw runtime_error("WAVE file is corrupt. Header not found."); } - auto rootChunkId = read(file); + const auto rootChunkId = read(file); if (rootChunkId != fourcc('R', 'I', 'F', 'F')) { throw runtime_error("Unknown file format. Only WAVE files are supported."); } read(file); // Chunk size - uint32_t waveId = read(file); + const uint32_t waveId = read(file); if (waveId != fourcc('W', 'A', 'V', 'E')) { throw runtime_error(format("File format is not WAVE, but {}.", fourccToString(waveId))); } // Read chunks until we reach the data chunk - bool reachedDataChunk = false; - while (!reachedDataChunk && remaining(8)) { - uint32_t chunkId = read(file); - int chunkSize = read(file); + bool processedFormatChunk = false; + bool processedDataChunk = false; + while ((!processedFormatChunk || !processedDataChunk) && remaining(8)) { + const uint32_t chunkId = read(file); + const streamoff chunkSize = read(file); + const streamoff chunkEnd = roundUpToEven(file.tellg() + chunkSize); switch (chunkId) { case fourcc('f', 'm', 't', ' '): { // Read relevant data uint16_t codec = read(file); formatInfo.channelCount = read(file); - formatInfo.frameRate = read(file); + formatInfo.frameRate = read(file); read(file); // Bytes per second - int frameSize = read(file); - int bitsPerSample = read(file); - - // We've read 16 bytes so far. Skip the remainder. - file.seekg(roundToEven(chunkSize) - 16, std::ios_base::cur); + const int bytesPerFrame = read(file); + const int bitsPerSampleOnDisk = read(file); + int bitsPerSample = bitsPerSampleOnDisk; + if (chunkSize > 16) { + const int extensionSize = read(file); + if (extensionSize >= 22) { + // Read extension fields + bitsPerSample = read(file); + read(file); // Skip channel mask + const uint16_t codecOverride = read(file); + if (codec == Codec::Extensible) { + codec = codecOverride; + } + } + } // Determine sample format int bytesPerSample; @@ -96,11 +113,14 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) { } else if (bitsPerSample <= 24) { formatInfo.sampleFormat = SampleFormat::Int24; bytesPerSample = 3; + } else if (bitsPerSample <= 32) { + formatInfo.sampleFormat = SampleFormat::Int32; + bytesPerSample = 4; } else { throw runtime_error( format("Unsupported sample format: {}-bit PCM.", bitsPerSample)); } - if (bytesPerSample != frameSize / formatInfo.channelCount) { + if (bytesPerSample != bytesPerFrame / formatInfo.channelCount) { throw runtime_error("Unsupported sample organization."); } break; @@ -108,6 +128,9 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) { if (bitsPerSample == 32) { formatInfo.sampleFormat = SampleFormat::Float32; bytesPerSample = 4; + } else if (bitsPerSample == 64) { + formatInfo.sampleFormat = SampleFormat::Float64; + bytesPerSample = 8; } else { throw runtime_error( format("Unsupported sample format: {}-bit IEEE Float.", bitsPerSample) @@ -121,24 +144,30 @@ WaveFormatInfo getWaveFormatInfo(const path& filePath) { )); } formatInfo.bytesPerFrame = bytesPerSample * formatInfo.channelCount; + processedFormatChunk = true; break; } case fourcc('d', 'a', 't', 'a'): { - reachedDataChunk = true; formatInfo.dataOffset = file.tellg(); formatInfo.frameCount = chunkSize / formatInfo.bytesPerFrame; + processedDataChunk = true; break; } default: { - // Skip unknown chunk - file.seekg(roundToEven(chunkSize), std::ios_base::cur); + // Ignore unknown chunk break; } } + + // Seek to end of chunk + file.seekg(chunkEnd, std::ios_base::beg); } + if (!processedFormatChunk) throw runtime_error("Missing format chunk."); + if (!processedDataChunk) throw runtime_error("Missing data chunk."); + return formatInfo; } @@ -177,11 +206,22 @@ inline AudioClip::value_type readSample( sum += toNormalizedFloat(raw, INT24_MIN, INT24_MAX); break; } + case SampleFormat::Int32: + { + const int32_t raw = read(file); + sum += toNormalizedFloat(raw, INT32_MIN, INT32_MAX); + break; + } case SampleFormat::Float32: { sum += read(file); break; } + case SampleFormat::Float64: + { + sum += static_cast(read(file)); + break; + } } } @@ -196,13 +236,13 @@ SampleReader WaveFileReader::createUnsafeSampleReader() const { filePos = std::streampos(0) ](size_type index) mutable { const std::streampos newFilePos = formatInfo.dataOffset - + static_cast(index * formatInfo.bytesPerFrame); + + static_cast(index * formatInfo.bytesPerFrame); if (newFilePos != filePos) { file->seekg(newFilePos); } const value_type result = readSample(*file, formatInfo.sampleFormat, formatInfo.channelCount); - filePos = newFilePos + static_cast(formatInfo.bytesPerFrame); + filePos = newFilePos + static_cast(formatInfo.bytesPerFrame); return result; }; } @@ -454,4 +494,4 @@ string codecToString(int codec) { default: return format("{0:#x}", codec); } -} \ No newline at end of file +} diff --git a/rhubarb/src/audio/WaveFileReader.h b/rhubarb/src/audio/WaveFileReader.h index c128b0f..0680844 100644 --- a/rhubarb/src/audio/WaveFileReader.h +++ b/rhubarb/src/audio/WaveFileReader.h @@ -7,7 +7,9 @@ enum class SampleFormat { UInt8, Int16, Int24, - Float32 + Int32, + Float32, + Float64 }; struct WaveFormatInfo { diff --git a/rhubarb/tests/WaveFileReaderTests.cpp b/rhubarb/tests/WaveFileReaderTests.cpp new file mode 100644 index 0000000..c73e236 --- /dev/null +++ b/rhubarb/tests/WaveFileReaderTests.cpp @@ -0,0 +1,186 @@ +#include +#include "audio/WaveFileReader.h" +#include "tools/platformTools.h" + +using namespace testing; + +TEST(getWaveFormatInfo, float32FromAudacity) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float32-audacity.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float32); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4); + EXPECT_EQ(formatInfo.dataOffset, 88); +} + +TEST(getWaveFormatInfo, float32FromAudition) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float32-audition.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float32); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4); + EXPECT_EQ(formatInfo.dataOffset, 92); +} + +TEST(getWaveFormatInfo, float32FromFfmpeg) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float32-ffmpeg.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float32); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4); + EXPECT_EQ(formatInfo.dataOffset, 114); +} + +TEST(getWaveFormatInfo, float32FromSoundforge) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float32-soundforge.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float32); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4); + EXPECT_EQ(formatInfo.dataOffset, 44); +} + +TEST(getWaveFormatInfo, float64FromFfmpeg) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-float64-ffmpeg.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Float64); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 8); + EXPECT_EQ(formatInfo.dataOffset, 114); +} + +TEST(getWaveFormatInfo, int16FromAudacity) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int16-audacity.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int16); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 2); + EXPECT_EQ(formatInfo.dataOffset, 44); +} + +TEST(getWaveFormatInfo, int16FromAudition) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int16-audition.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int16); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 2); + EXPECT_EQ(formatInfo.dataOffset, 92); +} + +TEST(getWaveFormatInfo, int16FromFfmpeg) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int16-ffmpeg.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int16); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 2); + EXPECT_EQ(formatInfo.dataOffset, 78); +} + +TEST(getWaveFormatInfo, int16FromSoundforge) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int16-soundforge.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int16); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 2); + EXPECT_EQ(formatInfo.dataOffset, 44); +} + +TEST(getWaveFormatInfo, int24FromAudacity) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int24-audacity.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int24); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 3); + EXPECT_EQ(formatInfo.dataOffset, 44); +} + +TEST(getWaveFormatInfo, int24FromAudition) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int24-audition.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int24); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 3); + EXPECT_EQ(formatInfo.dataOffset, 92); +} + +TEST(getWaveFormatInfo, int24FromFfmpeg) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int24-ffmpeg.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int24); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 3); + EXPECT_EQ(formatInfo.dataOffset, 102); +} + +TEST(getWaveFormatInfo, int24FromSoundforge) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int24-soundforge.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int24); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 3); + EXPECT_EQ(formatInfo.dataOffset, 44); +} + +TEST(getWaveFormatInfo, int32FromFfmpeg) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int32-ffmpeg.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int32); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4); + EXPECT_EQ(formatInfo.dataOffset, 102); +} + +TEST(getWaveFormatInfo, int32FromSoundforge) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-int32-soundforge.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::Int32); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 4); + EXPECT_EQ(formatInfo.dataOffset, 44); +} + +TEST(getWaveFormatInfo, uint8FromAudition) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-uint8-audition.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::UInt8); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 1); + EXPECT_EQ(formatInfo.dataOffset, 92); +} + +TEST(getWaveFormatInfo, uint8FromFfmpeg) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-uint8-ffmpeg.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::UInt8); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 1); + EXPECT_EQ(formatInfo.dataOffset, 78); +} + +TEST(getWaveFormatInfo, uint8FromSoundforge) { + auto formatInfo = getWaveFormatInfo(getBinDirectory() / "tests/resources/sine-triangle-uint8-soundforge.wav"); + EXPECT_EQ(formatInfo.frameRate, 48000); + EXPECT_EQ(formatInfo.frameCount, 480000); + EXPECT_EQ(formatInfo.channelCount, 2); + EXPECT_EQ(formatInfo.sampleFormat, SampleFormat::UInt8); + EXPECT_EQ(formatInfo.bytesPerFrame, 2 * 1); + EXPECT_EQ(formatInfo.dataOffset, 44); +} +