From ea4426085123ce13c5400020f9315231ed06317c Mon Sep 17 00:00:00 2001 From: srishti Date: Thu, 11 Dec 2025 11:34:29 +0530 Subject: [PATCH 1/4] feat: Added libfvad library and integrated Voice Activity Detection added unit test for VAD --- base/include/VADTransform.h | 64 +++++++++ base/src/VADTransform.cpp | 254 +++++++++++++++++++++++++++++++++++ base/test/vad_tests.cpp | 85 ++++++++++++ base/test/view_vad_output.py | 71 ++++++++++ 4 files changed, 474 insertions(+) create mode 100644 base/include/VADTransform.h create mode 100644 base/src/VADTransform.cpp create mode 100644 base/test/vad_tests.cpp create mode 100644 base/test/view_vad_output.py diff --git a/base/include/VADTransform.h b/base/include/VADTransform.h new file mode 100644 index 000000000..1b5de839a --- /dev/null +++ b/base/include/VADTransform.h @@ -0,0 +1,64 @@ +#pragma once + +#include "Module.h" + +class VADTransformProps : public ModuleProps +{ +public: + enum AggressivenessMode { + QUALITY = 0, // Least aggressive (best quality, catches more speech) + LOW_BITRATE = 1, // Moderate + AGGRESSIVE = 2, // More aggressive + VERY_AGGRESSIVE = 3 // Most aggressive (best bandwidth saving, only clear speech) + }; + + enum FrameLength { + FRAME_10MS = 10, + FRAME_20MS = 20, + FRAME_30MS = 30 + }; + + VADTransformProps( + int _sampleRate = 16000, + AggressivenessMode _mode = QUALITY, + FrameLength _frameLength = FRAME_10MS + ); + + int sampleRate; + AggressivenessMode mode; + FrameLength frameLength; + + size_t getSerializeSize(); + +private: + friend class boost::serialization::access; + + template + void serialize(Archive& ar, const unsigned int version); +}; + +class VADTransform : public Module +{ +public: + VADTransform(VADTransformProps _props); + virtual ~VADTransform(); + + bool init(); + bool term(); + void setProps(VADTransformProps& props); + VADTransformProps getProps(); + +protected: + bool process(frame_container& frames); + bool processSOS(frame_sp& frame); + bool validateInputPins(); + bool 
validateOutputPins(); + void addInputPin(framemetadata_sp& metadata, string& pinId); + bool handlePropsChange(frame_sp& frame); + bool processEOS(string& pinId); + +private: + void setMetadata(framemetadata_sp& metadata); + class Detail; + boost::shared_ptr mDetail; +}; diff --git a/base/src/VADTransform.cpp b/base/src/VADTransform.cpp new file mode 100644 index 000000000..9396b66f6 --- /dev/null +++ b/base/src/VADTransform.cpp @@ -0,0 +1,254 @@ +#include "VADTransform.h" +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Frame.h" +#include "Logger.h" +#include "Utils.h" +#include "fvad.h" + +// VADTransformProps implementation +VADTransformProps::VADTransformProps( + int _sampleRate, + AggressivenessMode _mode, + FrameLength _frameLength +) : sampleRate(_sampleRate), + mode(_mode), + frameLength(_frameLength) +{} + +size_t VADTransformProps::getSerializeSize() { + return ModuleProps::getSerializeSize() + + sizeof(sampleRate) + + sizeof(mode) + + sizeof(frameLength); +} + +template +void VADTransformProps::serialize(Archive& ar, const unsigned int version) { + ar& boost::serialization::base_object(*this); + ar& sampleRate; + ar& mode; + ar& frameLength; +} + +// Detail class - holds libfvad instance +class VADTransform::Detail +{ +public: + Detail(VADTransformProps& _props) : mProps(_props), mVad(nullptr) + { + } + + ~Detail() + { + if (mVad) { + fvad_free(mVad); + mVad = nullptr; + } + } + + bool init() + { + // Create libfvad instance + mVad = fvad_new(); + if (!mVad) { + LOG_ERROR << "Failed to create libfvad instance"; + return false; + } + + // Set sample rate (must be 8000, 16000, 32000, or 48000) + if (fvad_set_sample_rate(mVad, mProps.sampleRate) < 0) { + LOG_ERROR << "Invalid sample rate: " << mProps.sampleRate; + LOG_ERROR << "Valid rates are: 8000, 16000, 32000, 48000"; + return false; + } + + // Set aggressiveness mode (0-3) + if (fvad_set_mode(mVad, mProps.mode) < 0) { + LOG_ERROR << "Invalid aggressiveness mode: " << 
mProps.mode; + return false; + } + + LOG_INFO << "=== VAD Configuration ==="; + LOG_INFO << "Sample Rate: " << mProps.sampleRate << " Hz"; + LOG_INFO << "Aggressiveness Mode: " << mProps.mode << " (0=Quality, 3=Very Aggressive)"; + LOG_INFO << "Frame Length: " << mProps.frameLength << " ms"; + LOG_INFO << "Expected samples per frame: " << (mProps.sampleRate * mProps.frameLength / 1000); + LOG_INFO << "========================"; + + mFrameCount = 0; // Reset frame counter + + return true; + } + + int processAudio(const int16_t* samples, size_t length) + { + if (!mVad) { + LOG_ERROR << "VAD not initialized"; + return -1; + } + // Returns: 1 = speech, 0 = silence, -1 = error + int result = fvad_process(mVad, samples, length); + + if (result < 0) { + LOG_ERROR << "fvad_process failed. Check frame length matches expected size."; + LOG_ERROR << "Expected samples for " << mProps.frameLength << "ms at " + << mProps.sampleRate << "Hz: " + << (mProps.sampleRate * mProps.frameLength / 1000); + LOG_ERROR << "Actual samples received: " << length; + } + + mFrameCount++; + + return result; + } + + void setProps(VADTransformProps& props) + { + mProps = props; + } + +public: + framemetadata_sp mOutputMetadata; + std::string mOutputPinId; + VADTransformProps mProps; + +private: + Fvad* mVad; + size_t mFrameCount; +}; + +// VADTransform implementation +VADTransform::VADTransform(VADTransformProps _props) + : Module(TRANSFORM, "VADTransform", _props) +{ + mDetail.reset(new Detail(_props)); +} + +VADTransform::~VADTransform() {} + +bool VADTransform::validateInputPins() +{ + if (getNumberOfInputPins() != 1) { + LOG_ERROR << "<" << getId() << ">::validateInputPins expects 1 input. Actual: " << getNumberOfInputPins(); + return false; + } + + framemetadata_sp metadata = getFirstInputMetadata(); + FrameMetadata::FrameType frameType = metadata->getFrameType(); + + if (frameType != FrameMetadata::AUDIO) { + LOG_ERROR << "<" << getId() << ">::validateInputPins expects AUDIO input. 
Actual: " << frameType; + return false; + } + + FrameMetadata::MemType memType = metadata->getMemType(); + if (memType != FrameMetadata::MemType::HOST) { + LOG_ERROR << "<" << getId() << ">::validateInputPins expects HOST memory. Actual: " << memType; + return false; + } + + return true; +} + +bool VADTransform::validateOutputPins() +{ + if (getNumberOfOutputPins() != 1) { + LOG_ERROR << "<" << getId() << ">::validateOutputPins expects 1 output. Actual: " << getNumberOfOutputPins(); + return false; + } + + framemetadata_sp metadata = getFirstOutputMetadata(); + FrameMetadata::FrameType frameType = metadata->getFrameType(); + + if (frameType != FrameMetadata::GENERAL) { + LOG_ERROR << "<" << getId() << ">::validateOutputPins expects GENERAL output. Actual: " << frameType; + return false; + } + + return true; +} + +void VADTransform::addInputPin(framemetadata_sp& metadata, string& pinId) +{ + Module::addInputPin(metadata, pinId); + mDetail->mOutputMetadata = framemetadata_sp(new FrameMetadata(FrameMetadata::FrameType::GENERAL)); + mDetail->mOutputMetadata->copyHint(*metadata.get()); + mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata); +} + +bool VADTransform::init() +{ + if (!Module::init()) { + return false; + } + + return mDetail->init(); +} + +bool VADTransform::term() +{ + return Module::term(); +} + +bool VADTransform::process(frame_container& frames) +{ + // 1. Get input audio frame + auto frame = frames.begin()->second; + int16_t* samples = static_cast(frame->data()); + size_t sampleCount = frame->size() / 2; // Int16 = 2 bytes + + // 2. Call libfvad + int result = mDetail->processAudio(samples, sampleCount); + + // 3. Create output frame with VAD result + auto outFrame = makeFrame(sizeof(int)); + int vadResult = (result == 1) ? 1 : 0; + memcpy(outFrame->data(), &vadResult, sizeof(int)); + + // 4. 
Send output + frames.insert(make_pair(mDetail->mOutputPinId, outFrame)); + send(frames); + + return true; +} + +bool VADTransform::processSOS(frame_sp& frame) +{ + auto metadata = frame->getMetadata(); + setMetadata(metadata); + return true; +} + +void VADTransform::setMetadata(framemetadata_sp& metadata) +{ + if (!metadata->isSet()) { + return; + } +} + +VADTransformProps VADTransform::getProps() +{ + fillProps(mDetail->mProps); + return mDetail->mProps; +} + +void VADTransform::setProps(VADTransformProps& props) +{ + Module::addPropsToQueue(props); +} + +bool VADTransform::handlePropsChange(frame_sp& frame) +{ + auto ret = Module::handlePropsChange(frame, mDetail->mProps); + mDetail->setProps(mDetail->mProps); + + mDetail->init(); + + return ret; +} + +bool VADTransform::processEOS(string& pinId) +{ + return true; +} diff --git a/base/test/vad_tests.cpp b/base/test/vad_tests.cpp new file mode 100644 index 000000000..f10134800 --- /dev/null +++ b/base/test/vad_tests.cpp @@ -0,0 +1,85 @@ +#include "stdafx.h" +#include +#include +#include "VADTransform.h" +#include "AudioCaptureSrc.h" +#include "FileWriterModule.h" +#include "PipeLine.h" +#include "Logger.h" +#include "test_utils.h" + +BOOST_AUTO_TEST_SUITE(vad_tests) + +BOOST_AUTO_TEST_CASE(vad_basic_test, *boost::unit_test::disabled()) +{ + Logger::setLogLevel(boost::log::trivial::severity_level::info); + LOG_INFO << "Starting VAD Basic Test..."; + + int sampleRate = 16000; + int channels = 1; + int processingInterval = 10; + + AudioCaptureSrcProps audioProps(sampleRate, channels, 0, processingInterval); + auto audioSrc = boost::shared_ptr(new AudioCaptureSrc(audioProps)); + + VADTransformProps vadProps( + 16000, + VADTransformProps::QUALITY, + VADTransformProps::FRAME_10MS + ); + auto vad = boost::shared_ptr(new VADTransform(vadProps)); + + std::string filename = "./data/vad_output.raw"; + FileWriterModuleProps sinkProps(filename, true); + auto sink = boost::shared_ptr(new FileWriterModule(sinkProps)); + + 
audioSrc->setNext(vad); + vad->setNext(sink); + + PipeLine p("VADTestPipeline"); + p.appendModule(audioSrc); + + LOG_INFO << "Initializing Pipeline..."; + BOOST_TEST(p.init()); + + LOG_INFO << "Running for 10 seconds..."; + LOG_INFO << "First 5 seconds: STAY SILENT (expect mostly 0s)"; + LOG_INFO << "Last 5 seconds: SPEAK INTO MIC (expect mostly 1s)"; + + p.run_all_threaded(); + boost::this_thread::sleep_for(boost::chrono::seconds(10)); + + LOG_INFO << "Stopping Pipeline..."; + p.stop(); + p.term(); + p.wait_for_all(); +} + +BOOST_AUTO_TEST_CASE(vad_aggressiveness_test, *boost::unit_test::disabled()) +{ + Logger::setLogLevel(boost::log::trivial::severity_level::info); + LOG_INFO << "Testing different aggressiveness modes..."; + + // Test all 4 aggressiveness modes + VADTransformProps::AggressivenessMode modes[] = { + VADTransformProps::QUALITY, + VADTransformProps::LOW_BITRATE, + VADTransformProps::AGGRESSIVE, + VADTransformProps::VERY_AGGRESSIVE + }; + + for (int i = 0; i < 4; i++) { + LOG_INFO << "Testing mode " << i << "..."; + + VADTransformProps props(16000, modes[i], VADTransformProps::FRAME_10MS); + auto vad = boost::shared_ptr(new VADTransform(props)); + + // Just verify it initializes + BOOST_TEST(vad->init()); + vad->term(); + } + + LOG_INFO << "All aggressiveness modes initialized successfully"; +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/base/test/view_vad_output.py b/base/test/view_vad_output.py new file mode 100644 index 000000000..84f53acd3 --- /dev/null +++ b/base/test/view_vad_output.py @@ -0,0 +1,71 @@ +import struct +import sys +import os + +def visualize_vad(filename): + if not os.path.exists(filename): + print(f"Error: File '{filename}' not found.") + return + + print(f"Reading VAD output from: {filename}") + + with open(filename, 'rb') as f: + data = f.read() + + + num_samples = len(data) // 4 + print(f"Total Frames: {num_samples}") + + valid_bytes_len = num_samples * 4 + if len(data) != valid_bytes_len: + print(f"Warning: File size 
{len(data)} is not a multiple of 4. Truncating {len(data) - valid_bytes_len} bytes.") + data = data[:valid_bytes_len] + + + values = struct.unpack(f'<{num_samples}i', data) + + + zeros = values.count(0) + ones = values.count(1) + + print("-" * 40) + print(f"Silence Frames (0): {zeros}") + print(f"Speech Frames (1): {ones}") + if num_samples > 0: + print(f"Speech Activity: {(ones / num_samples) * 100:.2f}%") + print("-" * 40) + + chunk_size = 10 + print(f"\nTimeline (each char = {chunk_size} frames = {chunk_size*10}ms):") + print("Legend: '_' = Silence, '#' = Speech, '.' = Mixed") + + timeline = "" + for i in range(0, num_samples, chunk_size): + chunk = values[i:i+chunk_size] + chunk_sum = sum(chunk) + + if chunk_sum == 0: + timeline += "_" + elif chunk_sum == len(chunk): + timeline += "#" + else: + timeline += "." + + if len(timeline) >= 64: + print(timeline) + timeline = "" + + if timeline: + print(timeline) + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python view_vad.py ") + print("Example: python view_vad.py vad_output.raw") + + default_path = "data/vad_output.raw" + if os.path.exists(default_path): + print(f"\nNo file specified. 
Found default: {default_path}") + visualize_vad(default_path) + else: + visualize_vad(sys.argv[1]) From 240d44379773bd0bced877f9a6026da06753eafb Mon Sep 17 00:00:00 2001 From: srishti Date: Thu, 11 Dec 2025 11:53:21 +0530 Subject: [PATCH 2/4] changes in CMakelist --- base/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index 1c964a2b7..022bb94ec 100755 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -165,6 +165,9 @@ ENDIF(ENABLE_CUDA) include_directories(AFTER SYSTEM include) +add_subdirectory(../thirdparty/libfvad libfvad) +include_directories(AFTER SYSTEM ../thirdparty/libfvad/include) + # ApraPipes library SET(CORE_FILES @@ -339,6 +342,7 @@ SET(IP_FILES src/OverlayFactory.cpp src/TestSignalGeneratorSrc.cpp src/AudioToTextXForm.cpp + src/VADTransform.cpp src/AbsControlModule.cpp src/ThumbnailListGenerator.cpp ) @@ -366,6 +370,7 @@ SET(IP_FILES_H include/ColorConversionXForm.h include/Overlay.h include/AudioToTextXForm.h + include/VADTransform.h include/ThumbnailListGenerator.h ) @@ -627,6 +632,7 @@ SET(UT_FILES test/mp4_dts_strategy_tests.cpp test/overlaymodule_tests.cpp test/testSignalGeneratorSrc_tests.cpp + test/vad_tests.cpp test/audioToTextXform_tests.cpp test/simpleControlModuleTests.cpp ${ARM64_UT_FILES} @@ -692,6 +698,7 @@ target_link_libraries(aprapipesut bigint::bigint sfml-audio whisper::whisper + fvad ) IF(ENABLE_WINDOWS) From 2f60f6bb6a7dc03649452294f9e1a6630a2f232e Mon Sep 17 00:00:00 2001 From: Srishti Karanth Date: Thu, 18 Dec 2025 16:42:52 +0530 Subject: [PATCH 3/4] worked on pr comments: 1. renamed vad ->voice activity detector 2. bringing fvad from vcpkg 3 . made changes in unit test 4. 
sending input frame also , along with Voice Detection info --- base/CMakeLists.txt | 11 +- base/include/VoiceActivityDetector.h | 69 +++++ base/src/VoiceActivityDetector.cpp | 255 ++++++++++++++++++ base/test/VoiceActivityDetectorTests.cpp | 191 +++++++++++++ .../view_voice_activity_detector_output.py | 71 +++++ base/vcpkg.json | 13 +- .../custom-overlay/libfvad/portfile.cmake | 17 ++ thirdparty/custom-overlay/libfvad/vcpkg.json | 16 ++ 8 files changed, 629 insertions(+), 14 deletions(-) create mode 100644 base/include/VoiceActivityDetector.h create mode 100644 base/src/VoiceActivityDetector.cpp create mode 100644 base/test/VoiceActivityDetectorTests.cpp create mode 100644 base/test/view_voice_activity_detector_output.py create mode 100644 thirdparty/custom-overlay/libfvad/portfile.cmake create mode 100644 thirdparty/custom-overlay/libfvad/vcpkg.json diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index 022bb94ec..c0df3fb19 100755 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -50,6 +50,8 @@ find_package(ZXing CONFIG REQUIRED) find_package(bigint CONFIG REQUIRED) find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED) find_package(whisper CONFIG REQUIRED) +find_path(FVAD_INCLUDE_DIR fvad.h) +find_library(FVAD_LIBRARY NAMES fvad libfvad) IF(ENABLE_LINUX) @@ -342,7 +344,7 @@ SET(IP_FILES src/OverlayFactory.cpp src/TestSignalGeneratorSrc.cpp src/AudioToTextXForm.cpp - src/VADTransform.cpp + src/VoiceActivityDetector.cpp src/AbsControlModule.cpp src/ThumbnailListGenerator.cpp ) @@ -370,7 +372,7 @@ SET(IP_FILES_H include/ColorConversionXForm.h include/Overlay.h include/AudioToTextXForm.h - include/VADTransform.h + include/VoiceActivityDetector.h include/ThumbnailListGenerator.h ) @@ -528,6 +530,7 @@ target_include_directories ( aprapipes PRIVATE ${BARESIP_INC_DIR} ${LIBRE_INC_DIR} ${NVCODEC_INCLUDE_DIR} + ${FVAD_INCLUDE_DIR} ) @@ -632,7 +635,7 @@ SET(UT_FILES test/mp4_dts_strategy_tests.cpp test/overlaymodule_tests.cpp 
test/testSignalGeneratorSrc_tests.cpp - test/vad_tests.cpp + test/VoiceActivityDetectorTests.cpp test/audioToTextXform_tests.cpp test/simpleControlModuleTests.cpp ${ARM64_UT_FILES} @@ -698,7 +701,7 @@ target_link_libraries(aprapipesut bigint::bigint sfml-audio whisper::whisper - fvad + ${FVAD_LIBRARY} ) IF(ENABLE_WINDOWS) diff --git a/base/include/VoiceActivityDetector.h b/base/include/VoiceActivityDetector.h new file mode 100644 index 000000000..a64a71463 --- /dev/null +++ b/base/include/VoiceActivityDetector.h @@ -0,0 +1,69 @@ +#pragma once + +#include "Module.h" + +class VoiceActivityDetectorProps : public ModuleProps +{ +public: + + //decides speech vs no-speech + enum AggressivenessMode { + QUALITY = 0, // Least aggressive (best quality, catches more speech) + LOW_BITRATE = 1, // Moderate + AGGRESSIVE = 2, // More aggressive + VERY_AGGRESSIVE = 3 // Most aggressive (best bandwidth saving, only clear speech) + }; + + // audio time the VAD analyzes at once + // default sample rate is 16 kHz (16,000 samples/sec) + // 10ms - 160 samples 20ms - 320 samples 30ms - 480 samples + enum FrameLength { + FRAME_10MS = 10, // Lowest latency + FRAME_20MS = 20, // Balanced + FRAME_30MS = 30 // Best accuracy + }; + + VoiceActivityDetectorProps( + int _sampleRate = 16000, + AggressivenessMode _mode = QUALITY, + FrameLength _frameLength = FRAME_10MS + ); + + int sampleRate; + AggressivenessMode mode; + FrameLength frameLength; + + size_t getSerializeSize(); + +private: + friend class boost::serialization::access; + + template + void serialize(Archive& ar, const unsigned int version); +}; + +class VoiceActivityDetector : public Module +{ +public: + VoiceActivityDetector(VoiceActivityDetectorProps _props); + virtual ~VoiceActivityDetector(); + + bool init(); + bool term(); + void setProps(VoiceActivityDetectorProps& props); + VoiceActivityDetectorProps getProps(); + +protected: + bool process(frame_container& frames); + bool processSOS(frame_sp& frame); + bool validateInputPins(); 
+ bool validateOutputPins(); + void addInputPin(framemetadata_sp& metadata, string& pinId); + bool handlePropsChange(frame_sp& frame); + bool processEOS(string& pinId); + +private: + void setMetadata(framemetadata_sp& metadata); + class Detail; + boost::shared_ptr mDetail; +}; diff --git a/base/src/VoiceActivityDetector.cpp b/base/src/VoiceActivityDetector.cpp new file mode 100644 index 000000000..06b0f6d7d --- /dev/null +++ b/base/src/VoiceActivityDetector.cpp @@ -0,0 +1,255 @@ +#include "VoiceActivityDetector.h" +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Frame.h" +#include "Logger.h" +#include "Utils.h" +#include "fvad.h" + +// VoiceActivityDetectorProps implementation +VoiceActivityDetectorProps::VoiceActivityDetectorProps( + int _sampleRate, + AggressivenessMode _mode, + FrameLength _frameLength +) : sampleRate(_sampleRate), + mode(_mode), + frameLength(_frameLength) +{} + +size_t VoiceActivityDetectorProps::getSerializeSize() { + return ModuleProps::getSerializeSize() + + sizeof(sampleRate) + + sizeof(mode) + + sizeof(frameLength); +} + +template +void VoiceActivityDetectorProps::serialize(Archive& ar, const unsigned int version) { + ar& boost::serialization::base_object(*this); + ar& sampleRate; + ar& mode; + ar& frameLength; +} + +class VoiceActivityDetector::Detail +{ +public: + Detail(VoiceActivityDetectorProps& _props) : mProps(_props), mVoiceDetector(nullptr) + { + } + + ~Detail() + { + if (mVoiceDetector) { + fvad_free(mVoiceDetector); + mVoiceDetector = nullptr; + } + } + + bool init() + { + // Create libfvad instance + mVoiceDetector = fvad_new(); + if (!mVoiceDetector) { + LOG_ERROR << "Failed to create libfvad instance"; + return false; + } + + // Set sample rate (must be 8000, 16000, 32000, or 48000) + if (fvad_set_sample_rate(mVoiceDetector, mProps.sampleRate) < 0) { + LOG_ERROR << "Invalid sample rate: " << mProps.sampleRate; + LOG_ERROR << "Valid rates are: 8000, 16000, 32000, 48000"; + return false; + } + + // Set 
aggressiveness mode (0-3) + if (fvad_set_mode(mVoiceDetector, mProps.mode) < 0) { + LOG_ERROR << "Invalid aggressiveness mode: " << mProps.mode; + return false; + } + + LOG_INFO << "=== VAD Configuration ==="; + LOG_INFO << "Sample Rate: " << mProps.sampleRate << " Hz"; + LOG_INFO << "Aggressiveness Mode: " << mProps.mode << " (0=Quality, 3=Very Aggressive)"; + LOG_INFO << "Frame Length: " << mProps.frameLength << " ms"; + LOG_INFO << "Expected samples per frame: " << (mProps.sampleRate * mProps.frameLength / 1000); + LOG_INFO << "========================"; + + mFrameCount = 0; // Reset frame counter + return true; + } + + int processAudio(const int16_t* samples, size_t length) + { + if (!mVoiceDetector) { + LOG_ERROR << "VAD not initialized"; + return -1; + } + // Returns: 1 = speech, 0 = silence, -1 = error + int result = fvad_process(mVoiceDetector, samples, length); + + if (result < 0) { + LOG_ERROR << "fvad_process failed. Check frame length matches expected size."; + LOG_ERROR << "Expected samples for " << mProps.frameLength << "ms at " + << mProps.sampleRate << "Hz: " + << (mProps.sampleRate * mProps.frameLength / 1000); + LOG_ERROR << "Actual samples received: " << length; + } + + mFrameCount++; + + return result; + } + + void setProps(VoiceActivityDetectorProps& props) + { + mProps = props; + } + +public: + // Detail class definition + framemetadata_sp mAudioMetadata; + framemetadata_sp mVadMetadata; + std::string mAudioPinId; + std::string mVadPinId; + VoiceActivityDetectorProps mProps; + +private: + Fvad* mVoiceDetector; + size_t mFrameCount; +}; + +VoiceActivityDetector::VoiceActivityDetector(VoiceActivityDetectorProps _props) + : Module(TRANSFORM, "VoiceActivityDetector", _props) +{ + mDetail.reset(new Detail(_props)); +} + +VoiceActivityDetector::~VoiceActivityDetector() {} + +bool VoiceActivityDetector::validateInputPins() +{ + if (getNumberOfInputPins() != 1) { + LOG_ERROR << "<" << getId() << ">::validateInputPins expects 1 input. 
Actual: " << getNumberOfInputPins(); + return false; + } + + framemetadata_sp metadata = getFirstInputMetadata(); + FrameMetadata::FrameType frameType = metadata->getFrameType(); + + if (frameType != FrameMetadata::AUDIO) { + LOG_ERROR << "<" << getId() << ">::validateInputPins expects AUDIO input. Actual: " << frameType; + return false; + } + + FrameMetadata::MemType memType = metadata->getMemType(); + if (memType != FrameMetadata::MemType::HOST) { + LOG_ERROR << "<" << getId() << ">::validateInputPins expects HOST memory. Actual: " << memType; + return false; + } + + return true; +} + +bool VoiceActivityDetector::validateOutputPins() +{ + if (getNumberOfOutputPins() > 2) { + LOG_ERROR << "<" << getId() << ">::validateOutputPins expects <= 2 outputs. Actual: " << getNumberOfOutputPins(); + return false; + } + + return true; +} + +void VoiceActivityDetector::addInputPin(framemetadata_sp& metadata, string& pinId) +{ + Module::addInputPin(metadata, pinId); + + // Pin 1: Audio Passthrough + mDetail->mAudioMetadata = framemetadata_sp(new FrameMetadata(FrameMetadata::FrameType::AUDIO, FrameMetadata::MemType::HOST)); + mDetail->mAudioMetadata->copyHint(*metadata.get()); + mDetail->mAudioPinId = addOutputPin(mDetail->mAudioMetadata); + + // Pin 2: VAD Result + mDetail->mVadMetadata = framemetadata_sp(new FrameMetadata(FrameMetadata::FrameType::GENERAL, FrameMetadata::MemType::HOST)); + mDetail->mVadPinId = addOutputPin(mDetail->mVadMetadata); +} + +bool VoiceActivityDetector::init() +{ + if (!Module::init()) { + return false; + } + + return mDetail->init(); +} + +bool VoiceActivityDetector::term() +{ + return Module::term(); +} + +bool VoiceActivityDetector::process(frame_container& frames) +{ + // 1. Get input audio frame + auto frame = frames.begin()->second; + int16_t* samples = static_cast(frame->data()); + size_t sampleCount = frame->size() / 2; // Int16 = 2 bytes + + // 2. Call libfvad + int result = mDetail->processAudio(samples, sampleCount); + + // 3. 
Audio Passthrough + frames.insert(make_pair(mDetail->mAudioPinId, frame)); + + // 4. Create output frame with VAD result + auto outFrame = makeFrame(sizeof(int), mDetail->mVadPinId); + int vadResult = (result == 1) ? 1 : 0; + memcpy(outFrame->data(), &vadResult, sizeof(int)); + + // 5. Send output + frames.insert(make_pair(mDetail->mVadPinId, outFrame)); + send(frames); + + return true; +} + +bool VoiceActivityDetector::processSOS(frame_sp& frame) +{ + auto metadata = frame->getMetadata(); + setMetadata(metadata); + return true; +} + +void VoiceActivityDetector::setMetadata(framemetadata_sp& metadata) +{ + if (!metadata->isSet()) { + return; + } +} + +VoiceActivityDetectorProps VoiceActivityDetector::getProps() +{ + fillProps(mDetail->mProps); + return mDetail->mProps; +} + +void VoiceActivityDetector::setProps(VoiceActivityDetectorProps& props) +{ + Module::addPropsToQueue(props); +} + +bool VoiceActivityDetector::handlePropsChange(frame_sp& frame) +{ + auto ret = Module::handlePropsChange(frame, mDetail->mProps); + mDetail->setProps(mDetail->mProps); + + mDetail->init(); + + return ret; +} + +bool VoiceActivityDetector::processEOS(string& pinId) +{ + return true; +} diff --git a/base/test/VoiceActivityDetectorTests.cpp b/base/test/VoiceActivityDetectorTests.cpp new file mode 100644 index 000000000..f121371d3 --- /dev/null +++ b/base/test/VoiceActivityDetectorTests.cpp @@ -0,0 +1,191 @@ +#include "stdafx.h" +#include +#include +#include "VoiceActivityDetector.h" +#include "AudioCaptureSrc.h" +#include "FileWriterModule.h" +#include "PipeLine.h" +#include "Logger.h" +#include "test_utils.h" + +BOOST_AUTO_TEST_SUITE(voice_activity_detector_tests) + +BOOST_AUTO_TEST_CASE(voice_activity_detector_basic_test) +{ + Logger::setLogLevel(boost::log::trivial::severity_level::info); + LOG_INFO << "Starting Voice Activity Detector Basic Test..."; + + int sampleRate = 16000; + int channels = 1; + int processingInterval = 10; + + AudioCaptureSrcProps audioProps(sampleRate, 
channels, 0, processingInterval); + auto audioSrc = boost::shared_ptr(new AudioCaptureSrc(audioProps)); + + VoiceActivityDetectorProps vadProps( + 16000, + VoiceActivityDetectorProps::VERY_AGGRESSIVE, + VoiceActivityDetectorProps::FRAME_10MS + ); + auto vad = boost::shared_ptr(new VoiceActivityDetector(vadProps)); + + // Sink 1: Audio Passthrough + std::string fileAudio = "./data/basic_test_audio.raw"; + FileWriterModuleProps sinkPropsAudio(fileAudio, true); + auto sinkAudio = boost::shared_ptr(new FileWriterModule(sinkPropsAudio)); + + // Sink 2: VAD Result + std::string fileVad = "./data/basic_test_vad.raw"; + FileWriterModuleProps sinkPropsVad(fileVad, true); + auto sinkVad = boost::shared_ptr(new FileWriterModule(sinkPropsVad)); + + audioSrc->setNext(vad); + + auto audioPins = vad->getAllOutputPinsByType(FrameMetadata::FrameType::AUDIO); + auto vadPins = vad->getAllOutputPinsByType(FrameMetadata::FrameType::GENERAL); + + vad->setNext(sinkAudio, audioPins); + vad->setNext(sinkVad, vadPins); + + PipeLine p("VoiceActivityDetectorTestPipeline"); + p.appendModule(audioSrc); + + LOG_INFO << "Initializing Pipeline..."; + BOOST_TEST(p.init()); + + LOG_INFO << "Running for 10 seconds..."; + LOG_INFO << "First 5 seconds: STAY SILENT (expect mostly 0s)"; + LOG_INFO << "Last 5 seconds: SPEAK INTO MIC (expect mostly 1s)"; + + p.run_all_threaded(); + boost::this_thread::sleep_for(boost::chrono::seconds(10)); + + LOG_INFO << "Stopping Pipeline..."; + p.stop(); + p.term(); + p.wait_for_all(); +} + +BOOST_AUTO_TEST_CASE(voice_activity_detector_aggressiveness_test) +{ + Logger::setLogLevel(boost::log::trivial::severity_level::info); + LOG_INFO << "Testing different aggressiveness modes..."; + + // Test all 4 aggressiveness modes + VoiceActivityDetectorProps::AggressivenessMode modes[] = { + VoiceActivityDetectorProps::QUALITY, + VoiceActivityDetectorProps::LOW_BITRATE, + VoiceActivityDetectorProps::AGGRESSIVE, + VoiceActivityDetectorProps::VERY_AGGRESSIVE + }; + + for (int i = 
0; i < 4; i++) { + LOG_INFO << "Testing mode " << i << "..."; + + AudioCaptureSrcProps audioProps(16000, 1, 0, 10); + auto source = boost::shared_ptr(new AudioCaptureSrc(audioProps)); + + VoiceActivityDetectorProps props(16000, modes[i], VoiceActivityDetectorProps::FRAME_10MS); + auto vad = boost::shared_ptr(new VoiceActivityDetector(props)); + + source->setNext(vad); + + BOOST_TEST(source->init()); + BOOST_TEST(vad->init()); + + // Verify the mode was set correctly + auto currentProps = vad->getProps(); + BOOST_TEST(currentProps.mode == modes[i]); + + vad->term(); + source->term(); + } + + LOG_INFO << "All aggressiveness modes initialized successfully"; +} + +BOOST_AUTO_TEST_CASE(voice_activity_detector_props_test) +{ + Logger::setLogLevel(boost::log::trivial::severity_level::info); + LOG_INFO << "Testing getProps and setProps..."; + + // 1. Setup helper source to satisfy input pin requirement + AudioCaptureSrcProps audioProps(16000, 1, 0, 10); + auto source = boost::shared_ptr(new AudioCaptureSrc(audioProps)); + + VoiceActivityDetectorProps initialProps( + 16000, + VoiceActivityDetectorProps::QUALITY, + VoiceActivityDetectorProps::FRAME_10MS + ); + auto vad = boost::shared_ptr(new VoiceActivityDetector(initialProps)); + + // Connect source to vad so validation passes (expects 1 input pin) + source->setNext(vad); + + // Create two sinks for the two output pins (Audio, VAD) + std::string filenameAudio = "./data/test_audio.raw"; + FileWriterModuleProps sinkPropsAudio(filenameAudio, true); + auto sinkAudio = boost::shared_ptr(new FileWriterModule(sinkPropsAudio)); + + std::string filenameVad = "./data/test_vad.raw"; + FileWriterModuleProps sinkPropsVad(filenameVad, true); + auto sinkVad = boost::shared_ptr(new FileWriterModule(sinkPropsVad)); + + // Connect VAD outputs + // This ensures both output pins are connected + vad->setNext(sinkAudio); + vad->setNext(sinkVad); + + // Initialize + if (!source->init()) + { + BOOST_ERROR("Source init failed"); + return; + } + if 
(!sinkAudio->init()) + { + BOOST_ERROR("SinkAudio init failed"); + return; + } + if (!sinkVad->init()) + { + BOOST_ERROR("SinkVad init failed"); + return; + } + if (!vad->init()) + { + BOOST_ERROR("VAD init failed"); + return; + } + + // Verify initial props + auto currentProps = vad->getProps(); + BOOST_TEST(currentProps.sampleRate == 16000); + BOOST_TEST(currentProps.mode == VoiceActivityDetectorProps::QUALITY); + BOOST_TEST(currentProps.frameLength == VoiceActivityDetectorProps::FRAME_10MS); + + // Change props + VoiceActivityDetectorProps newProps( + 16000, + VoiceActivityDetectorProps::VERY_AGGRESSIVE, + VoiceActivityDetectorProps::FRAME_20MS + ); + + // setProps is async - it queues a command + vad->setProps(newProps); + + vad->step(); + + // Verify updated props + currentProps = vad->getProps(); + BOOST_TEST(currentProps.mode == VoiceActivityDetectorProps::VERY_AGGRESSIVE); + BOOST_TEST(currentProps.frameLength == VoiceActivityDetectorProps::FRAME_20MS); + + vad->term(); + source->term(); + sinkAudio->term(); + sinkVad->term(); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/base/test/view_voice_activity_detector_output.py b/base/test/view_voice_activity_detector_output.py new file mode 100644 index 000000000..84f53acd3 --- /dev/null +++ b/base/test/view_voice_activity_detector_output.py @@ -0,0 +1,71 @@ +import struct +import sys +import os + +def visualize_vad(filename): + if not os.path.exists(filename): + print(f"Error: File '{filename}' not found.") + return + + print(f"Reading VAD output from: {filename}") + + with open(filename, 'rb') as f: + data = f.read() + + + num_samples = len(data) // 4 + print(f"Total Frames: {num_samples}") + + valid_bytes_len = num_samples * 4 + if len(data) != valid_bytes_len: + print(f"Warning: File size {len(data)} is not a multiple of 4. 
Truncating {len(data) - valid_bytes_len} bytes.") + data = data[:valid_bytes_len] + + + values = struct.unpack(f'<{num_samples}i', data) + + + zeros = values.count(0) + ones = values.count(1) + + print("-" * 40) + print(f"Silence Frames (0): {zeros}") + print(f"Speech Frames (1): {ones}") + if num_samples > 0: + print(f"Speech Activity: {(ones / num_samples) * 100:.2f}%") + print("-" * 40) + + chunk_size = 10 + print(f"\nTimeline (each char = {chunk_size} frames = {chunk_size*10}ms):") + print("Legend: '_' = Silence, '#' = Speech, '.' = Mixed") + + timeline = "" + for i in range(0, num_samples, chunk_size): + chunk = values[i:i+chunk_size] + chunk_sum = sum(chunk) + + if chunk_sum == 0: + timeline += "_" + elif chunk_sum == len(chunk): + timeline += "#" + else: + timeline += "." + + if len(timeline) >= 64: + print(timeline) + timeline = "" + + if timeline: + print(timeline) + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python view_vad.py ") + print("Example: python view_vad.py vad_output.raw") + + default_path = "data/vad_output.raw" + if os.path.exists(default_path): + print(f"\nNo file specified. 
Found default: {default_path}") + visualize_vad(default_path) + else: + visualize_vad(sys.argv[1]) diff --git a/base/vcpkg.json b/base/vcpkg.json index 02da3473d..87cc38cfa 100644 --- a/base/vcpkg.json +++ b/base/vcpkg.json @@ -14,20 +14,11 @@ } ], "dependencies": [ - { - "name": "whisper", - "default-features": false, - "features": [ - "cuda" - ] - }, { "name": "opencv4", "default-features": false, "features": [ "contrib", - "cuda", - "cudnn", "dnn", "jpeg", "nonfree", @@ -61,6 +52,8 @@ "zlib", "sfml", "brotli", + "whisper", + "libfvad", { "name": "gtk3", "platform": "!windows" @@ -99,4 +92,4 @@ "name": "libmp4" } ] -} +} \ No newline at end of file diff --git a/thirdparty/custom-overlay/libfvad/portfile.cmake b/thirdparty/custom-overlay/libfvad/portfile.cmake new file mode 100644 index 000000000..cba47d067 --- /dev/null +++ b/thirdparty/custom-overlay/libfvad/portfile.cmake @@ -0,0 +1,17 @@ +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO dpirch/libfvad + REF 532ab666c20d3cfda38bca63abbb0f152706c369 + SHA512 926fb7155aae7a4ca6caf8e31a06e96125f8becda45bbb1218b2d2941b4ebf4e90d8552718e497b80a90d21a6813165d5e217cc354919eea4f2297d89226ed86 + HEAD_REF master +) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" +) + +vcpkg_cmake_install() +vcpkg_copy_pdbs() + + +file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) diff --git a/thirdparty/custom-overlay/libfvad/vcpkg.json b/thirdparty/custom-overlay/libfvad/vcpkg.json new file mode 100644 index 000000000..b1bde443c --- /dev/null +++ b/thirdparty/custom-overlay/libfvad/vcpkg.json @@ -0,0 +1,16 @@ +{ + "name": "libfvad", + "version": "1.0.1", + "description": "Voice activity detection (VAD) library, based on WebRTC's VAD engine.", + "homepage": "https://github.com/dpirch/libfvad", + "dependencies": [ + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ] +} \ No newline at end of file From 
85781d9f803ae01129bc30acab2ca243c0878188 Mon Sep 17 00:00:00 2001 From: Srishti Karanth Date: Mon, 22 Dec 2025 13:25:17 +0530 Subject: [PATCH 4/4] added the speechOnlyMode bool to get the .raw file with only speech false:audio has both speech and silence true:audio has only speech --- base/include/VoiceActivityDetector.h | 4 +- base/src/VoiceActivityDetector.cpp | 30 +++++++++--- base/test/VoiceActivityDetectorTests.cpp | 58 ++++++++++++++++++------ 3 files changed, 70 insertions(+), 22 deletions(-) diff --git a/base/include/VoiceActivityDetector.h b/base/include/VoiceActivityDetector.h index a64a71463..f7c8e7d64 100644 --- a/base/include/VoiceActivityDetector.h +++ b/base/include/VoiceActivityDetector.h @@ -26,12 +26,14 @@ class VoiceActivityDetectorProps : public ModuleProps VoiceActivityDetectorProps( int _sampleRate = 16000, AggressivenessMode _mode = QUALITY, - FrameLength _frameLength = FRAME_10MS + FrameLength _frameLength = FRAME_10MS, + bool _speechOnly = false // When true, only outputs audio when speech is detected ); int sampleRate; AggressivenessMode mode; FrameLength frameLength; + bool speechOnly; // If true, audio passthrough only sends frames with speech size_t getSerializeSize(); diff --git a/base/src/VoiceActivityDetector.cpp b/base/src/VoiceActivityDetector.cpp index 06b0f6d7d..b0dc1b383 100644 --- a/base/src/VoiceActivityDetector.cpp +++ b/base/src/VoiceActivityDetector.cpp @@ -10,17 +10,20 @@ VoiceActivityDetectorProps::VoiceActivityDetectorProps( int _sampleRate, AggressivenessMode _mode, - FrameLength _frameLength + FrameLength _frameLength, + bool _speechOnly ) : sampleRate(_sampleRate), mode(_mode), - frameLength(_frameLength) + frameLength(_frameLength), + speechOnly(_speechOnly) {} size_t VoiceActivityDetectorProps::getSerializeSize() { return ModuleProps::getSerializeSize() + sizeof(sampleRate) + sizeof(mode) + - sizeof(frameLength); + sizeof(frameLength) + + sizeof(speechOnly); } template @@ -29,6 +32,7 @@ void
VoiceActivityDetectorProps::serialize(Archive& ar, const unsigned int versi ar& sampleRate; ar& mode; ar& frameLength; + ar& speechOnly; } class VoiceActivityDetector::Detail @@ -72,6 +76,7 @@ class VoiceActivityDetector::Detail LOG_INFO << "Sample Rate: " << mProps.sampleRate << " Hz"; LOG_INFO << "Aggressiveness Mode: " << mProps.mode << " (0=Quality, 3=Very Aggressive)"; LOG_INFO << "Frame Length: " << mProps.frameLength << " ms"; + LOG_INFO << "Speech Only Mode: " << (mProps.speechOnly ? "ON" : "OFF"); LOG_INFO << "Expected samples per frame: " << (mProps.sampleRate * mProps.frameLength / 1000); LOG_INFO << "========================"; @@ -198,13 +203,24 @@ bool VoiceActivityDetector::process(frame_container& frames) // 2. Call libfvad int result = mDetail->processAudio(samples, sampleCount); + bool isSpeech = (result == 1); - // 3. Audio Passthrough - frames.insert(make_pair(mDetail->mAudioPinId, frame)); + // 3. Audio Passthrough - Conditional on speechOnly mode + // speechOnly - + // false:audio has both speech and silence + // true:audio has only speech + if (!mDetail->mProps.speechOnly ) { + frames.insert(make_pair(mDetail->mAudioPinId, frame)); + } + else { + if(isSpeech) { + frames.insert(make_pair(mDetail->mAudioPinId, frame)); + } + } - // 4. Create output frame with VAD result + // 4. Create output frame with VAD result - ALWAYS create and send auto outFrame = makeFrame(sizeof(int), mDetail->mVadPinId); - int vadResult = (result == 1) ? 1 : 0; + int vadResult = isSpeech ? 1 : 0; memcpy(outFrame->data(), &vadResult, sizeof(int)); // 5. 
Send output diff --git a/base/test/VoiceActivityDetectorTests.cpp b/base/test/VoiceActivityDetectorTests.cpp index f121371d3..b623bf30c 100644 --- a/base/test/VoiceActivityDetectorTests.cpp +++ b/base/test/VoiceActivityDetectorTests.cpp @@ -10,52 +10,62 @@ BOOST_AUTO_TEST_SUITE(voice_activity_detector_tests) -BOOST_AUTO_TEST_CASE(voice_activity_detector_basic_test) +// TEST: Speech-only output using speechOnly mode +BOOST_AUTO_TEST_CASE(voice_activity_filter_speech_only_test) { Logger::setLogLevel(boost::log::trivial::severity_level::info); - LOG_INFO << "Starting Voice Activity Detector Basic Test..."; + LOG_INFO << "Starting Speech-Only Mode Test..."; + LOG_INFO << "This test saves BOTH output pins for comparison"; int sampleRate = 16000; int channels = 1; int processingInterval = 10; + // Audio source AudioCaptureSrcProps audioProps(sampleRate, channels, 0, processingInterval); auto audioSrc = boost::shared_ptr(new AudioCaptureSrc(audioProps)); + // CHANGE THIS to true/false to test different modes + bool speechOnlyMode = true; + VoiceActivityDetectorProps vadProps( 16000, VoiceActivityDetectorProps::VERY_AGGRESSIVE, - VoiceActivityDetectorProps::FRAME_10MS + VoiceActivityDetectorProps::FRAME_10MS, + speechOnlyMode ); auto vad = boost::shared_ptr(new VoiceActivityDetector(vadProps)); - // Sink 1: Audio Passthrough - std::string fileAudio = "./data/basic_test_audio.raw"; + // Sink 1: Audio output + std::string fileAudio = speechOnlyMode ? 
"./data/speech_only_audio.raw" : "./data/all_audio.raw"; FileWriterModuleProps sinkPropsAudio(fileAudio, true); auto sinkAudio = boost::shared_ptr(new FileWriterModule(sinkPropsAudio)); - // Sink 2: VAD Result - std::string fileVad = "./data/basic_test_vad.raw"; + // Sink 2: VAD results output + std::string fileVad = "./data/vad.raw"; FileWriterModuleProps sinkPropsVad(fileVad, true); auto sinkVad = boost::shared_ptr(new FileWriterModule(sinkPropsVad)); + // Build pipeline: AudioSrc -> VAD -> Two FileWriters audioSrc->setNext(vad); + // Connect BOTH output pins auto audioPins = vad->getAllOutputPinsByType(FrameMetadata::FrameType::AUDIO); auto vadPins = vad->getAllOutputPinsByType(FrameMetadata::FrameType::GENERAL); + + vad->setNext(sinkAudio, audioPins); // Pin 1: Audio → sinkAudio + vad->setNext(sinkVad, vadPins); // Pin 2: VAD → sinkVad - vad->setNext(sinkAudio, audioPins); - vad->setNext(sinkVad, vadPins); - - PipeLine p("VoiceActivityDetectorTestPipeline"); + PipeLine p("SpeechOnlyTestPipeline"); p.appendModule(audioSrc); LOG_INFO << "Initializing Pipeline..."; BOOST_TEST(p.init()); - LOG_INFO << "Running for 10 seconds..."; - LOG_INFO << "First 5 seconds: STAY SILENT (expect mostly 0s)"; - LOG_INFO << "Last 5 seconds: SPEAK INTO MIC (expect mostly 1s)"; + LOG_INFO << "==========================================="; + LOG_INFO << "speechOnly mode: " << (speechOnlyMode ? "TRUE" : "FALSE"); + LOG_INFO << "Recording for 10 seconds..."; + LOG_INFO << "==========================================="; p.run_all_threaded(); boost::this_thread::sleep_for(boost::chrono::seconds(10)); @@ -64,6 +74,25 @@ BOOST_AUTO_TEST_CASE(voice_activity_detector_basic_test) p.stop(); p.term(); p.wait_for_all(); + + LOG_INFO << ""; + LOG_INFO << "==========================================="; + LOG_INFO << "Test complete! 
Files saved:"; + LOG_INFO << " Audio output: " << fileAudio; + LOG_INFO << " VAD output: " << fileVad; + LOG_INFO << ""; + LOG_INFO << "To play audio in Audacity:"; + LOG_INFO << " 1.File > Import > Raw Data"; + LOG_INFO << " 2. Select: " << fileAudio; + LOG_INFO << " 3. Settings:"; + LOG_INFO << " - Encoding: Signed 16-bit PCM"; + LOG_INFO << " - Byte order: Little-endian"; + LOG_INFO << " - Channels: 1 (Mono)"; + LOG_INFO << " - Sample rate: 16000 Hz"; + LOG_INFO << ""; + LOG_INFO << "To view VAD results:"; + LOG_INFO << " python view_voice_activity_detector_output.py " << fileVad; + LOG_INFO << "==========================================="; } BOOST_AUTO_TEST_CASE(voice_activity_detector_aggressiveness_test) @@ -189,3 +218,4 @@ BOOST_AUTO_TEST_CASE(voice_activity_detector_props_test) } BOOST_AUTO_TEST_SUITE_END() +