From ea4426085123ce13c5400020f9315231ed06317c Mon Sep 17 00:00:00 2001
From: srishti <srishtikaranth83@gmail.com>
Date: Thu, 11 Dec 2025 11:34:29 +0530
Subject: [PATCH 1/4] feat: Added libfvad library and integrated Voice Activity
 Detection added unit test  for VAD

---
 base/include/VADTransform.h  |  64 +++++++++
 base/src/VADTransform.cpp    | 254 +++++++++++++++++++++++++++++++++++
 base/test/vad_tests.cpp      |  85 ++++++++++++
 base/test/view_vad_output.py |  71 ++++++++++
 4 files changed, 474 insertions(+)
 create mode 100644 base/include/VADTransform.h
 create mode 100644 base/src/VADTransform.cpp
 create mode 100644 base/test/vad_tests.cpp
 create mode 100644 base/test/view_vad_output.py
diff --git a/base/include/VADTransform.h b/base/include/VADTransform.h
new file mode 100644
index 000000000..1b5de839a
--- /dev/null
+++ b/base/include/VADTransform.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include "Module.h"
+
+class VADTransformProps : public ModuleProps // settings consumed by VADTransform
+{
+public:
+	enum AggressivenessMode { // numeric value is handed to fvad_set_mode() in VADTransform.cpp
+		QUALITY = 0,          // Least aggressive (best quality, catches more speech)
+		LOW_BITRATE = 1,      // Moderate
+		AGGRESSIVE = 2,       // More aggressive
+		VERY_AGGRESSIVE = 3   // Most aggressive (best bandwidth saving, only clear speech)
+	};
+	// Frame duration in milliseconds. NOTE(review): VADTransform.cpp only logs this value; it is never passed to libfvad.
+	enum FrameLength {
+		FRAME_10MS = 10,
+		FRAME_20MS = 20,
+		FRAME_30MS = 30
+	};
+	// Every argument is optional; the defaults are 16000, QUALITY and FRAME_10MS.
+	VADTransformProps(
+		int _sampleRate = 16000,
+		AggressivenessMode _mode = QUALITY,
+		FrameLength _frameLength = FRAME_10MS
+	);
+	// Public settings, stored exactly as given to the constructor.
+	int sampleRate; // Hz; passed to fvad_set_sample_rate(), which may reject it during init()
+	AggressivenessMode mode;
+	FrameLength frameLength;
+	// Serialized size in bytes: the ModuleProps size plus sizeof each field above.
+	size_t getSerializeSize();
+
+private:
+	friend class boost::serialization::access;
+	// Defined in VADTransform.cpp: archives the ModuleProps base, then the three fields.
+	template <class Archive>
+	void serialize(Archive& ar, const unsigned int version);
+};
+
+class VADTransform : public Module // voice activity detection module built on libfvad
+{
+public:
+	VADTransform(VADTransformProps _props);
+	virtual ~VADTransform();
+	// init() creates and configures the libfvad instance after Module::init() succeeds.
+	bool init();
+	bool term();
+	void setProps(VADTransformProps& props);   // forwards props to Module::addPropsToQueue()
+	VADTransformProps getProps();              // returns the props currently held by the module
+
+protected:
+	bool process(frame_container& frames);     // emits one int per input frame: 1 = speech, otherwise 0
+	bool processSOS(frame_sp& frame);
+	bool validateInputPins();                  // exactly one AUDIO input in HOST memory
+	bool validateOutputPins();                 // exactly one GENERAL output
+	void addInputPin(framemetadata_sp& metadata, string& pinId); // also creates the GENERAL output pin
+	bool handlePropsChange(frame_sp& frame);   // applies new props and re-runs the detector setup
+	bool processEOS(string& pinId);            // always returns true
+
+private:
+	void setMetadata(framemetadata_sp& metadata);
+	class Detail;                              // defined in VADTransform.cpp; owns the libfvad handle
+	boost::shared_ptr<Detail> mDetail;
+};
diff --git a/base/src/VADTransform.cpp b/base/src/VADTransform.cpp
new file mode 100644
index 000000000..9396b66f6
--- /dev/null
+++ b/base/src/VADTransform.cpp
@@ -0,0 +1,254 @@
+#include "VADTransform.h"
+#include "FrameMetadata.h"
+#include "FrameMetadataFactory.h"
+#include "Frame.h"
+#include "Logger.h"
+#include "Utils.h"
+#include "fvad.h"
+
+// VADTransformProps implementation
+// Stores the three settings verbatim; nothing is validated here.
+VADTransformProps::VADTransformProps(int _sampleRate, AggressivenessMode _mode, FrameLength _frameLength)
+	: sampleRate(_sampleRate)
+	, mode(_mode)
+	, frameLength(_frameLength)
+{
+	// Intentionally empty: the member initializers above do all the work.
+}
+
+// Byte count for serialization: the base-class size plus this class's three fields.
+size_t VADTransformProps::getSerializeSize()
+{
+	const size_t ownFields = sizeof(sampleRate) + sizeof(mode) + sizeof(frameLength);
+	return ModuleProps::getSerializeSize() + ownFields;
+}
+
+// Archives the ModuleProps base first, then sampleRate, mode and frameLength, in that order.
+template <class Archive>
+void VADTransformProps::serialize(Archive& ar, const unsigned int version)
+{
+	ar & boost::serialization::base_object<ModuleProps>(*this)
+	   & sampleRate & mode & frameLength;
+}
+
+// Detail: private state of VADTransform - the libfvad handle, the current props and the output pin.
+class VADTransform::Detail
+{
+public:
+	Detail(VADTransformProps& _props) : mProps(_props), mVad(nullptr), mFrameCount(0)
+	{
+	}
+	// Frees the libfvad instance if one was created.
+	~Detail()
+	{
+		if (mVad) {
+			fvad_free(mVad);
+			mVad = nullptr;
+		}
+	}
+	// (Re)creates the libfvad instance and configures it from mProps; returns false on any failure.
+	bool init()
+	{
+		// init() is called again on every props change: free the previous instance instead of leaking it
+		if (mVad) {
+			fvad_free(mVad);
+		}
+		mVad = fvad_new();
+		if (!mVad) {
+			LOG_ERROR << "Failed to create libfvad instance";
+			return false;
+		}
+		// Set sample rate (must be 8000, 16000, 32000, or 48000)
+		if (fvad_set_sample_rate(mVad, mProps.sampleRate) < 0) {
+			LOG_ERROR << "Invalid sample rate: " << mProps.sampleRate;
+			LOG_ERROR << "Valid rates are: 8000, 16000, 32000, 48000";
+			return false;
+		}
+		
+		// Set aggressiveness mode (0-3)
+		if (fvad_set_mode(mVad, mProps.mode) < 0) {
+			LOG_ERROR << "Invalid aggressiveness mode: " << mProps.mode;
+			return false;
+		}
+		
+		LOG_INFO << "=== VAD Configuration ===";
+		LOG_INFO << "Sample Rate: " << mProps.sampleRate << " Hz";
+		LOG_INFO << "Aggressiveness Mode: " << mProps.mode << " (0=Quality, 3=Very Aggressive)";
+		LOG_INFO << "Frame Length: " << mProps.frameLength << " ms";
+		LOG_INFO << "Expected samples per frame: " << (mProps.sampleRate * mProps.frameLength / 1000);
+		LOG_INFO << "========================";
+		mFrameCount = 0;  // Reset frame counter
+		return true;
+	}
+	// Runs libfvad on `length` 16-bit samples; returns 1 = speech, 0 = silence, -1 = error.
+	int processAudio(const int16_t* samples, size_t length)
+	{
+		if (!mVad) {
+			LOG_ERROR << "VAD not initialized";
+			return -1;
+		}
+		// Returns: 1 = speech, 0 = silence, -1 = error
+		int result = fvad_process(mVad, samples, length);
+		
+		if (result < 0) {
+			LOG_ERROR << "fvad_process failed. Check frame length matches expected size.";
+			LOG_ERROR << "Expected samples for " << mProps.frameLength << "ms at " 
+			          << mProps.sampleRate << "Hz: " 
+			          << (mProps.sampleRate * mProps.frameLength / 1000);
+			LOG_ERROR << "Actual samples received: " << length;
+		}
+		
+		mFrameCount++;
+		
+		return result;
+	}
+	// Replaces the stored props; the libfvad instance is only reconfigured by the next init().
+	void setProps(VADTransformProps& props)
+	{
+		mProps = props;
+	}
+
+public:
+	framemetadata_sp mOutputMetadata;
+	std::string mOutputPinId;
+	VADTransformProps mProps;
+
+private:
+	Fvad* mVad;          // owned; nullptr until init() succeeds in creating it
+	size_t mFrameCount;  // calls to processAudio() that reached libfvad since the last init()
+};
+
+// VADTransform implementation
+// Registers the module as a TRANSFORM named "VADTransform" and builds its private state.
+VADTransform::VADTransform(VADTransformProps _props)
+	: Module(TRANSFORM, "VADTransform", _props), mDetail(new Detail(_props))
+{
+}
+
+VADTransform::~VADTransform() {} // nothing explicit: Detail's destructor frees the libfvad handle once mDetail releases it
+
+// Accepts exactly one input pin carrying AUDIO frames in HOST memory. The first
+// requirement that is not met is logged and false is returned.
+bool VADTransform::validateInputPins()
+{
+	const auto pinCount = getNumberOfInputPins();
+	if (pinCount != 1) {
+		LOG_ERROR << "<" << getId() << ">::validateInputPins expects 1 input. Actual: " << pinCount;
+		return false;
+	}
+
+	const framemetadata_sp inputMeta = getFirstInputMetadata();
+	const auto type = inputMeta->getFrameType();
+	if (type != FrameMetadata::AUDIO) {
+		LOG_ERROR << "<" << getId() << ">::validateInputPins expects AUDIO input. Actual: " << type;
+		return false;
+	}
+	const auto memory = inputMeta->getMemType();
+	if (memory != FrameMetadata::MemType::HOST) {
+		LOG_ERROR << "<" << getId() << ">::validateInputPins expects HOST memory. Actual: " << memory;
+		return false;
+	}
+	return true;
+}
+
+// Accepts exactly one output pin, and only if it carries GENERAL frames.
+bool VADTransform::validateOutputPins()
+{
+	const auto pinCount = getNumberOfOutputPins();
+	if (pinCount != 1) {
+		LOG_ERROR << "<" << getId() << ">::validateOutputPins expects 1 output. Actual: " << pinCount;
+		return false;
+	}
+
+	// The single output must be the GENERAL frame that carries the detection result.
+	const auto type = getFirstOutputMetadata()->getFrameType();
+	if (type == FrameMetadata::GENERAL) {
+		return true;
+	}
+	LOG_ERROR << "<" << getId() << ">::validateOutputPins expects GENERAL output. Actual: " << type;
+	return false;
+}
+
+void VADTransform::addInputPin(framemetadata_sp& metadata, string& pinId)
+{
+	Module::addInputPin(metadata, pinId);  // the base class registers the input pin itself
+	mDetail->mOutputMetadata.reset(new FrameMetadata(FrameMetadata::FrameType::GENERAL));
+	mDetail->mOutputMetadata->copyHint(*metadata);  // the output inherits the input's hint
+	mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata);  // remembered for process()
+}
+
+// Initializes the base module first; only if that succeeds is the libfvad
+// detector created and configured. Returns false when either step fails.
+bool VADTransform::init()
+{
+	const bool baseReady = Module::init();
+	// Short-circuit keeps Detail::init() from running after a base failure.
+	return baseReady && mDetail->init();
+}
+
+bool VADTransform::term()
+{
+	return Module::term();  // the libfvad instance is not released here; Detail's destructor does that
+}
+
+// Runs libfvad over the first frame in the container and emits one int per input
+// frame: 1 when speech was detected, 0 otherwise (detector errors also map to 0).
+bool VADTransform::process(frame_container& frames)
+{
+	// The payload is handed to libfvad as 16-bit samples.
+	auto audioFrame = frames.begin()->second;
+	const auto* pcm = static_cast<int16_t*>(audioFrame->data());
+	const size_t pcmCount = audioFrame->size() / sizeof(int16_t);
+
+	const int verdict = mDetail->processAudio(pcm, pcmCount);
+
+	// Anything other than an explicit "speech" (1) is reported as 0.
+	const int speechFlag = static_cast<int>(verdict == 1);
+	auto resultFrame = makeFrame(sizeof(speechFlag));
+	memcpy(resultFrame->data(), &speechFlag, sizeof(speechFlag));
+
+	// Publish the result frame on the output pin created in addInputPin().
+	frames.insert(make_pair(mDetail->mOutputPinId, resultFrame));
+	send(frames);
+	return true;
+}
+
+bool VADTransform::processSOS(frame_sp& frame)
+{
+	auto metadata = frame->getMetadata();  // named local: setMetadata() takes a non-const reference
+	setMetadata(metadata);                 // setMetadata() currently changes nothing
+	return true;
+}
+
+void VADTransform::setMetadata(framemetadata_sp& metadata)
+{
+	if (!metadata->isSet()) {  // NOTE(review): placeholder - both paths return without using the metadata
+		return;
+	}
+}
+
+VADTransformProps VADTransform::getProps()
+{
+	fillProps(mDetail->mProps);  // base-class helper; presumably refreshes the ModuleProps part - confirm
+	return mDetail->mProps;      // returned by value: callers get a copy
+}
+
+void VADTransform::setProps(VADTransformProps& props)
+{
+	Module::addPropsToQueue(props);  // queued, not applied here; handlePropsChange() applies it later
+}
+
+// Applies a queued props update to mDetail->mProps, then re-runs the detector
+// setup with the new settings. Returns false if either step fails.
+bool VADTransform::handlePropsChange(frame_sp& frame)
+{
+	auto ret = Module::handlePropsChange(frame, mDetail->mProps);
+	// The props object was passed in directly above, so no extra copy is needed.
+	// init() always runs, and its failure is reported instead of being discarded.
+	return mDetail->init() && ret;
+}
+
+bool VADTransform::processEOS(string& pinId)
+{
+	return true;  // end-of-stream needs no handling here; pinId is unused
+}
diff --git a/base/test/vad_tests.cpp b/base/test/vad_tests.cpp
new file mode 100644
index 000000000..f10134800
--- /dev/null
+++ b/base/test/vad_tests.cpp
@@ -0,0 +1,85 @@
+#include "stdafx.h"
+#include <boost/test/unit_test.hpp>
+#include <fstream>
+#include "VADTransform.h"
+#include "AudioCaptureSrc.h"
+#include "FileWriterModule.h"
+#include "PipeLine.h"
+#include "Logger.h"
+#include "test_utils.h"
+
+BOOST_AUTO_TEST_SUITE(vad_tests)
+
+BOOST_AUTO_TEST_CASE(vad_basic_test, *boost::unit_test::disabled()) // manual test: records for 10 s and expects a person to speak
+{
+	Logger::setLogLevel(boost::log::trivial::severity_level::info);
+	LOG_INFO << "Starting VAD Basic Test...";
+	// Capture settings; the 16000 sample rate matches the value given to the detector below.
+	int sampleRate = 16000;
+	int channels = 1;  
+	int processingInterval = 10;  
+	
+	AudioCaptureSrcProps audioProps(sampleRate, channels, 0, processingInterval);
+	auto audioSrc = boost::shared_ptr<AudioCaptureSrc>(new AudioCaptureSrc(audioProps));
+
+	VADTransformProps vadProps(
+		16000,  
+		VADTransformProps::QUALITY, 
+		VADTransformProps::FRAME_10MS  
+	);
+	auto vad = boost::shared_ptr<VADTransform>(new VADTransform(vadProps));
+	// Sink for the per-frame int results; view_vad_output.py looks for this file by default.
+	std::string filename = "./data/vad_output.raw";
+	FileWriterModuleProps sinkProps(filename, true);
+	auto sink = boost::shared_ptr<FileWriterModule>(new FileWriterModule(sinkProps));
+
+	audioSrc->setNext(vad);
+	vad->setNext(sink);
+	// Only the source is appended; presumably the pipeline reaches vad and sink through setNext() - confirm.
+	PipeLine p("VADTestPipeline");
+	p.appendModule(audioSrc);
+
+	LOG_INFO << "Initializing Pipeline...";
+	BOOST_TEST(p.init());
+
+	LOG_INFO << "Running for 10 seconds...";
+	LOG_INFO << "First 5 seconds: STAY SILENT (expect mostly 0s)";
+	LOG_INFO << "Last 5 seconds: SPEAK INTO MIC (expect mostly 1s)";
+	// Nothing is asserted on the output; the written file is meant to be inspected by hand.
+	p.run_all_threaded();
+	boost::this_thread::sleep_for(boost::chrono::seconds(10));
+
+	LOG_INFO << "Stopping Pipeline...";
+	p.stop();
+	p.term();
+	p.wait_for_all();
+}
+
+BOOST_AUTO_TEST_CASE(vad_aggressiveness_test, *boost::unit_test::disabled())
+{
+	Logger::setLogLevel(boost::log::trivial::severity_level::info);
+	LOG_INFO << "Testing different aggressiveness modes...";
+	// NOTE(review): vad has no input pin here, while validateInputPins() requires exactly one - init() may fail if enabled.
+	// Test all 4 aggressiveness modes
+	VADTransformProps::AggressivenessMode modes[] = {
+		VADTransformProps::QUALITY,
+		VADTransformProps::LOW_BITRATE,
+		VADTransformProps::AGGRESSIVE,
+		VADTransformProps::VERY_AGGRESSIVE
+	};
+
+	for (int i = 0; i < 4; i++) {
+		LOG_INFO << "Testing mode " << i << "...";
+		// A fresh module per mode, always at 16000 Hz with 10 ms frames.
+		VADTransformProps props(16000, modes[i], VADTransformProps::FRAME_10MS);
+		auto vad = boost::shared_ptr<VADTransform>(new VADTransform(props));
+		
+		// Just verify it initializes
+		BOOST_TEST(vad->init());
+		vad->term();
+	}
+	
+	LOG_INFO << "All aggressiveness modes initialized successfully";
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/base/test/view_vad_output.py b/base/test/view_vad_output.py
new file mode 100644
index 000000000..84f53acd3
--- /dev/null
+++ b/base/test/view_vad_output.py
@@ -0,0 +1,71 @@
+import struct
+import sys
+import os
+
+def visualize_vad(filename):
+    if not os.path.exists(filename):
+        print(f"Error: File '{filename}' not found.")
+        return
+
+    print(f"Reading VAD output from: {filename}")
+    
+    with open(filename, 'rb') as f:
+        data = f.read()
+
+   
+    num_samples = len(data) // 4
+    print(f"Total Frames: {num_samples}")
+    
+    valid_bytes_len = num_samples * 4
+    if len(data) != valid_bytes_len:
+        print(f"Warning: File size {len(data)} is not a multiple of 4. Truncating {len(data) - valid_bytes_len} bytes.")
+        data = data[:valid_bytes_len]
+
+   
+    values = struct.unpack(f'<{num_samples}i', data)
+
+
+    zeros = values.count(0)
+    ones = values.count(1)
+    
+    print("-" * 40)
+    print(f"Silence Frames (0): {zeros}")
+    print(f"Speech Frames  (1): {ones}")
+    if num_samples > 0:
+        print(f"Speech Activity:    {(ones / num_samples) * 100:.2f}%")
+    print("-" * 40)
+
+    chunk_size = 10 
+    print(f"\nTimeline (each char = {chunk_size} frames = {chunk_size*10}ms):")
+    print("Legend: '_' = Silence, '#' = Speech, '.' = Mixed")
+    
+    timeline = ""
+    for i in range(0, num_samples, chunk_size):
+        chunk = values[i:i+chunk_size]
+        chunk_sum = sum(chunk)
+        
+        if chunk_sum == 0:
+            timeline += "_"
+        elif chunk_sum == len(chunk):
+            timeline += "#"
+        else:
+            timeline += "."
+            
+        if len(timeline) >= 64:
+            print(timeline)
+            timeline = ""
+            
+    if timeline:
+        print(timeline)
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python view_vad.py <path_to_raw_file>")
+        print("Example: python view_vad.py vad_output.raw")
+        
+        default_path = "data/vad_output.raw"
+        if os.path.exists(default_path):
+            print(f"\nNo file specified. Found default: {default_path}")
+            visualize_vad(default_path)
+    else:
+        visualize_vad(sys.argv[1])

From 240d44379773bd0bced877f9a6026da06753eafb Mon Sep 17 00:00:00 2001
From: srishti <srishtikaranth83@gmail.com>
Date: Thu, 11 Dec 2025 11:53:21 +0530
Subject: [PATCH 2/4] changes in CMakelist

---
 base/CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt
index 1c964a2b7..022bb94ec 100755
--- a/base/CMakeLists.txt
+++ b/base/CMakeLists.txt
@@ -165,6 +165,9 @@ ENDIF(ENABLE_CUDA)
 
 include_directories(AFTER SYSTEM include)
 
+add_subdirectory(../thirdparty/libfvad libfvad)
+include_directories(AFTER SYSTEM ../thirdparty/libfvad/include)
+
 # ApraPipes library
 
 SET(CORE_FILES
@@ -339,6 +342,7 @@ SET(IP_FILES
 	src/OverlayFactory.cpp
 	src/TestSignalGeneratorSrc.cpp
 	src/AudioToTextXForm.cpp 
+	src/VADTransform.cpp
 	src/AbsControlModule.cpp
 	src/ThumbnailListGenerator.cpp
 )
@@ -366,6 +370,7 @@ SET(IP_FILES_H
 	include/ColorConversionXForm.h
 	include/Overlay.h
 	include/AudioToTextXForm.h
+	include/VADTransform.h
 	include/ThumbnailListGenerator.h
 )
 
@@ -627,6 +632,7 @@ SET(UT_FILES
 	test/mp4_dts_strategy_tests.cpp
 	test/overlaymodule_tests.cpp
 	test/testSignalGeneratorSrc_tests.cpp
+	test/vad_tests.cpp
 	test/audioToTextXform_tests.cpp
 	test/simpleControlModuleTests.cpp
 	${ARM64_UT_FILES}
@@ -692,6 +698,7 @@ target_link_libraries(aprapipesut
   bigint::bigint
   sfml-audio
   whisper::whisper
+  fvad
   )
 
 IF(ENABLE_WINDOWS)

From 2f60f6bb6a7dc03649452294f9e1a6630a2f232e Mon Sep 17 00:00:00 2001
From: Srishti Karanth <srishtik@apra.in>
Date: Thu, 18 Dec 2025 16:42:52 +0530
Subject: [PATCH 3/4] worked on pr comments: 1. renamed vad ->voice activity
 detector 2. bringing fvad from vcpkg 3 . made changes in unit test 4. sending
 input frame also , along with Voice Detection info

---
 base/CMakeLists.txt                           |  11 +-
 base/include/VoiceActivityDetector.h          |  69 +++++
 base/src/VoiceActivityDetector.cpp            | 255 ++++++++++++++++++
 base/test/VoiceActivityDetectorTests.cpp      | 191 +++++++++++++
 .../view_voice_activity_detector_output.py    |  71 +++++
 base/vcpkg.json                               |  13 +-
 .../custom-overlay/libfvad/portfile.cmake     |  17 ++
 thirdparty/custom-overlay/libfvad/vcpkg.json  |  16 ++
 8 files changed, 629 insertions(+), 14 deletions(-)
 create mode 100644 base/include/VoiceActivityDetector.h
 create mode 100644 base/src/VoiceActivityDetector.cpp
 create mode 100644 base/test/VoiceActivityDetectorTests.cpp
 create mode 100644 base/test/view_voice_activity_detector_output.py
 create mode 100644 thirdparty/custom-overlay/libfvad/portfile.cmake
 create mode 100644 thirdparty/custom-overlay/libfvad/vcpkg.json

diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt
index 022bb94ec..c0df3fb19 100755
--- a/base/CMakeLists.txt
+++ b/base/CMakeLists.txt
@@ -50,6 +50,8 @@ find_package(ZXing CONFIG REQUIRED)
 find_package(bigint CONFIG REQUIRED)
 find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED)
 find_package(whisper CONFIG REQUIRED)
+find_path(FVAD_INCLUDE_DIR fvad.h)
+find_library(FVAD_LIBRARY NAMES fvad libfvad)
 
 
 IF(ENABLE_LINUX)
@@ -342,7 +344,7 @@ SET(IP_FILES
 	src/OverlayFactory.cpp
 	src/TestSignalGeneratorSrc.cpp
 	src/AudioToTextXForm.cpp 
-	src/VADTransform.cpp
+	src/VoiceActivityDetector.cpp
 	src/AbsControlModule.cpp
 	src/ThumbnailListGenerator.cpp
 )
@@ -370,7 +372,7 @@ SET(IP_FILES_H
 	include/ColorConversionXForm.h
 	include/Overlay.h
 	include/AudioToTextXForm.h
-	include/VADTransform.h
+	include/VoiceActivityDetector.h
 	include/ThumbnailListGenerator.h
 )
 
@@ -528,6 +530,7 @@ target_include_directories ( aprapipes PRIVATE
 	${BARESIP_INC_DIR}
 	${LIBRE_INC_DIR}
 	${NVCODEC_INCLUDE_DIR}
+	${FVAD_INCLUDE_DIR}
 )
 
 
@@ -632,7 +635,7 @@ SET(UT_FILES
 	test/mp4_dts_strategy_tests.cpp
 	test/overlaymodule_tests.cpp
 	test/testSignalGeneratorSrc_tests.cpp
-	test/vad_tests.cpp
+	test/VoiceActivityDetectorTests.cpp
 	test/audioToTextXform_tests.cpp
 	test/simpleControlModuleTests.cpp
 	${ARM64_UT_FILES}
@@ -698,7 +701,7 @@ target_link_libraries(aprapipesut
   bigint::bigint
   sfml-audio
   whisper::whisper
-  fvad
+  ${FVAD_LIBRARY}
   )
 
 IF(ENABLE_WINDOWS)
diff --git a/base/include/VoiceActivityDetector.h b/base/include/VoiceActivityDetector.h
new file mode 100644
index 000000000..a64a71463
--- /dev/null
+++ b/base/include/VoiceActivityDetector.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include "Module.h"
+
+class VoiceActivityDetectorProps : public ModuleProps
+{
+public:
+	// Settings consumed by VoiceActivityDetector.
+	// How strict the detector is about calling a frame speech; the value is handed to fvad_set_mode().
+	enum AggressivenessMode {
+		QUALITY = 0,          // Least aggressive (best quality, catches more speech)
+		LOW_BITRATE = 1,      // Moderate
+		AGGRESSIVE = 2,       // More aggressive
+		VERY_AGGRESSIVE = 3   // Most aggressive (best bandwidth saving, only clear speech)
+	};
+	
+	// Length of audio, in milliseconds, analysed per detection call.
+	// At the default 16000 Hz rate: 10 ms = 160 samples, 20 ms = 320, 30 ms = 480.
+	// NOTE(review): VoiceActivityDetector.cpp only logs this value; it is never passed to libfvad.
+	enum FrameLength {
+		FRAME_10MS = 10,      // Lowest latency
+		FRAME_20MS = 20,      // Balanced
+		FRAME_30MS = 30       // Best accuracy
+	};
+	// Every argument is optional; the defaults are 16000, QUALITY and FRAME_10MS.
+	VoiceActivityDetectorProps(
+		int _sampleRate = 16000,
+		AggressivenessMode _mode = QUALITY,
+		FrameLength _frameLength = FRAME_10MS
+	);
+	// Public settings, stored exactly as given to the constructor.
+	int sampleRate; // Hz; passed to fvad_set_sample_rate(), which may reject it during init()
+	AggressivenessMode mode;
+	FrameLength frameLength;
+	// Serialized size in bytes: the ModuleProps size plus sizeof each field above.
+	size_t getSerializeSize();
+
+private:
+	friend class boost::serialization::access;
+	// Defined in VoiceActivityDetector.cpp: archives the ModuleProps base, then the three fields.
+	template <class Archive>
+	void serialize(Archive& ar, const unsigned int version);
+};
+
+class VoiceActivityDetector : public Module // voice activity detection module built on libfvad
+{
+public:
+	VoiceActivityDetector(VoiceActivityDetectorProps _props);
+	virtual ~VoiceActivityDetector();
+	// init() creates and configures the libfvad instance after Module::init() succeeds.
+	bool init();
+	bool term();
+	void setProps(VoiceActivityDetectorProps& props);  // forwards props to Module::addPropsToQueue()
+	VoiceActivityDetectorProps getProps();             // returns the props currently held by the module
+
+protected:
+	bool process(frame_container& frames);     // passes the audio frame through and adds an int flag frame
+	bool processSOS(frame_sp& frame);
+	bool validateInputPins();                  // exactly one AUDIO input in HOST memory
+	bool validateOutputPins();                 // at most two output pins
+	void addInputPin(framemetadata_sp& metadata, string& pinId); // also creates the AUDIO and GENERAL output pins
+	bool handlePropsChange(frame_sp& frame);   // applies new props and re-runs the detector setup
+	bool processEOS(string& pinId);            // always returns true
+
+private:
+	void setMetadata(framemetadata_sp& metadata);
+	class Detail;                              // defined in VoiceActivityDetector.cpp; owns the libfvad handle
+	boost::shared_ptr<Detail> mDetail;
+};
diff --git a/base/src/VoiceActivityDetector.cpp b/base/src/VoiceActivityDetector.cpp
new file mode 100644
index 000000000..06b0f6d7d
--- /dev/null
+++ b/base/src/VoiceActivityDetector.cpp
@@ -0,0 +1,255 @@
+#include "VoiceActivityDetector.h"
+#include "FrameMetadata.h"
+#include "FrameMetadataFactory.h"
+#include "Frame.h"
+#include "Logger.h"
+#include "Utils.h"
+#include "fvad.h"
+
+// VoiceActivityDetectorProps implementation
+// Stores the three settings verbatim; nothing is validated here.
+VoiceActivityDetectorProps::VoiceActivityDetectorProps(int _sampleRate, AggressivenessMode _mode, FrameLength _frameLength)
+	: sampleRate(_sampleRate)
+	, mode(_mode)
+	, frameLength(_frameLength)
+{
+	// Intentionally empty: the member initializers above do all the work.
+}
+
+// Byte count for serialization: the base-class size plus this class's three fields.
+size_t VoiceActivityDetectorProps::getSerializeSize()
+{
+	const size_t ownFields = sizeof(sampleRate) + sizeof(mode) + sizeof(frameLength);
+	return ModuleProps::getSerializeSize() + ownFields;
+}
+
+// Archives the ModuleProps base first, then sampleRate, mode and frameLength, in that order.
+template <class Archive>
+void VoiceActivityDetectorProps::serialize(Archive& ar, const unsigned int version)
+{
+	ar & boost::serialization::base_object<ModuleProps>(*this)
+	   & sampleRate & mode & frameLength;
+}
+
+// Detail: private state of VoiceActivityDetector - the libfvad handle, the current props and both output pins.
+class VoiceActivityDetector::Detail
+{
+public:
+	Detail(VoiceActivityDetectorProps& _props) : mProps(_props), mVoiceDetector(nullptr), mFrameCount(0)
+	{
+	}
+	// Frees the libfvad instance if one was created.
+	~Detail()
+	{
+		if (mVoiceDetector) {
+			fvad_free(mVoiceDetector);
+			mVoiceDetector = nullptr;
+		}
+	}
+	// (Re)creates the libfvad instance and configures it from mProps; returns false on any failure.
+	bool init()
+	{
+		// init() is called again on every props change: free the previous instance instead of leaking it
+		if (mVoiceDetector) {
+			fvad_free(mVoiceDetector);
+		}
+		mVoiceDetector = fvad_new();
+		if (!mVoiceDetector) {
+			LOG_ERROR << "Failed to create libfvad instance";
+			return false;
+		}
+		// Set sample rate (must be 8000, 16000, 32000, or 48000)
+		if (fvad_set_sample_rate(mVoiceDetector, mProps.sampleRate) < 0) {
+			LOG_ERROR << "Invalid sample rate: " << mProps.sampleRate;
+			LOG_ERROR << "Valid rates are: 8000, 16000, 32000, 48000";
+			return false;
+		}
+		
+		// Set aggressiveness mode (0-3)
+		if (fvad_set_mode(mVoiceDetector, mProps.mode) < 0) {
+			LOG_ERROR << "Invalid aggressiveness mode: " << mProps.mode;
+			return false;
+		}
+		
+		LOG_INFO << "=== VAD Configuration ===";
+		LOG_INFO << "Sample Rate: " << mProps.sampleRate << " Hz";
+		LOG_INFO << "Aggressiveness Mode: " << mProps.mode << " (0=Quality, 3=Very Aggressive)";
+		LOG_INFO << "Frame Length: " << mProps.frameLength << " ms";
+		LOG_INFO << "Expected samples per frame: " << (mProps.sampleRate * mProps.frameLength / 1000);
+		LOG_INFO << "========================";
+		mFrameCount = 0;  // Reset frame counter
+		return true;
+	}
+	// Runs libfvad on `length` 16-bit samples; returns 1 = speech, 0 = silence, -1 = error.
+	int processAudio(const int16_t* samples, size_t length)
+	{
+		if (!mVoiceDetector) {
+			LOG_ERROR << "VAD not initialized";
+			return -1;
+		}
+		// Returns: 1 = speech, 0 = silence, -1 = error
+		int result = fvad_process(mVoiceDetector, samples, length);
+		
+		if (result < 0) {
+			LOG_ERROR << "fvad_process failed. Check frame length matches expected size.";
+			LOG_ERROR << "Expected samples for " << mProps.frameLength << "ms at " 
+			          << mProps.sampleRate << "Hz: " 
+			          << (mProps.sampleRate * mProps.frameLength / 1000);
+			LOG_ERROR << "Actual samples received: " << length;
+		}
+		
+		mFrameCount++;
+		
+		return result;
+	}
+	// Replaces the stored props; the libfvad instance is only reconfigured by the next init().
+	void setProps(VoiceActivityDetectorProps& props)
+	{
+		mProps = props;
+	}
+
+public:
+	framemetadata_sp mAudioMetadata;
+	framemetadata_sp mVadMetadata;
+	std::string mAudioPinId;
+	std::string mVadPinId;
+	VoiceActivityDetectorProps mProps;
+private:
+	Fvad* mVoiceDetector;  // owned; nullptr until init() succeeds in creating it
+	size_t mFrameCount;    // calls to processAudio() that reached libfvad since the last init()
+};
+
+// Registers the module as a TRANSFORM named "VoiceActivityDetector" and builds its private state.
+VoiceActivityDetector::VoiceActivityDetector(VoiceActivityDetectorProps _props)
+	: Module(TRANSFORM, "VoiceActivityDetector", _props), mDetail(new Detail(_props))
+{
+}
+
+VoiceActivityDetector::~VoiceActivityDetector() {} // nothing explicit: Detail's destructor frees the libfvad handle once mDetail releases it
+
+// Accepts exactly one input pin carrying AUDIO frames in HOST memory. The first
+// requirement that is not met is logged and false is returned.
+bool VoiceActivityDetector::validateInputPins()
+{
+	const auto pinCount = getNumberOfInputPins();
+	if (pinCount != 1) {
+		LOG_ERROR << "<" << getId() << ">::validateInputPins expects 1 input. Actual: " << pinCount;
+		return false;
+	}
+
+	const framemetadata_sp inputMeta = getFirstInputMetadata();
+	const auto type = inputMeta->getFrameType();
+	if (type != FrameMetadata::AUDIO) {
+		LOG_ERROR << "<" << getId() << ">::validateInputPins expects AUDIO input. Actual: " << type;
+		return false;
+	}
+	const auto memory = inputMeta->getMemType();
+	if (memory != FrameMetadata::MemType::HOST) {
+		LOG_ERROR << "<" << getId() << ">::validateInputPins expects HOST memory. Actual: " << memory;
+		return false;
+	}
+	return true;
+}
+
+bool VoiceActivityDetector::validateOutputPins()
+{
+	const auto pinCount = getNumberOfOutputPins();  // read once; reused in the log message
+	if (pinCount <= 2) {
+		return true;  // any count up to two output pins is accepted
+	}
+	LOG_ERROR << "<" << getId() << ">::validateOutputPins expects <= 2 outputs. Actual: " << pinCount;
+	return false;
+}
+
+// Registers the input pin and creates the module's two output pins alongside it.
+void VoiceActivityDetector::addInputPin(framemetadata_sp& metadata, string& pinId)
+{
+	Module::addInputPin(metadata, pinId);
+
+	// Output 1 - audio passthrough: AUDIO/HOST metadata carrying the input's hint.
+	mDetail->mAudioMetadata.reset(new FrameMetadata(FrameMetadata::FrameType::AUDIO, FrameMetadata::MemType::HOST));
+	mDetail->mAudioMetadata->copyHint(*metadata);
+	mDetail->mAudioPinId = addOutputPin(mDetail->mAudioMetadata);
+	// Output 2 - detection result: GENERAL/HOST metadata, no hint copied.
+	mDetail->mVadMetadata.reset(new FrameMetadata(FrameMetadata::FrameType::GENERAL, FrameMetadata::MemType::HOST));
+	mDetail->mVadPinId = addOutputPin(mDetail->mVadMetadata);
+}
+
+// Initializes the base module first; only if that succeeds is the libfvad
+// detector created and configured. Returns false when either step fails.
+bool VoiceActivityDetector::init()
+{
+	const bool baseReady = Module::init();
+	// Short-circuit keeps Detail::init() from running after a base failure.
+	return baseReady && mDetail->init();
+}
+
+bool VoiceActivityDetector::term()
+{
+	return Module::term();  // the libfvad instance is not released here; Detail's destructor does that
+}
+
+// Runs libfvad on the first frame in the container, then sends two outputs: the
+// untouched audio frame and an int flag (1 = speech, 0 = anything else, errors included).
+bool VoiceActivityDetector::process(frame_container& frames)
+{
+	// The payload is handed to libfvad as 16-bit samples.
+	auto audioFrame = frames.begin()->second;
+	const auto* pcm = static_cast<int16_t*>(audioFrame->data());
+	const size_t pcmCount = audioFrame->size() / sizeof(int16_t);
+
+	const int verdict = mDetail->processAudio(pcm, pcmCount);
+
+	// Passthrough: the same frame object is re-published on the audio pin.
+	frames.insert(make_pair(mDetail->mAudioPinId, audioFrame));
+
+	// Detection result: a frame on the VAD pin holding a single int.
+	const int speechFlag = static_cast<int>(verdict == 1);
+	auto flagFrame = makeFrame(sizeof(speechFlag), mDetail->mVadPinId);
+	memcpy(flagFrame->data(), &speechFlag, sizeof(speechFlag));
+	frames.insert(make_pair(mDetail->mVadPinId, flagFrame));
+
+	// Both outputs leave in the same send() call.
+	send(frames);
+	return true;
+}
+
+bool VoiceActivityDetector::processSOS(frame_sp& frame)
+{
+	auto metadata = frame->getMetadata();  // named local: setMetadata() takes a non-const reference
+	setMetadata(metadata);                 // setMetadata() currently changes nothing
+	return true;
+}
+
+void VoiceActivityDetector::setMetadata(framemetadata_sp& metadata)
+{
+	if (!metadata->isSet()) {  // NOTE(review): placeholder - both paths return without using the metadata
+		return;
+	}
+}
+
+VoiceActivityDetectorProps VoiceActivityDetector::getProps()
+{
+	fillProps(mDetail->mProps);  // base-class helper; presumably refreshes the ModuleProps part - confirm
+	return mDetail->mProps;      // returned by value: callers get a copy
+}
+
+void VoiceActivityDetector::setProps(VoiceActivityDetectorProps& props)
+{
+	Module::addPropsToQueue(props);  // queued, not applied here; handlePropsChange() applies it later
+}
+
+// Applies a queued props update to mDetail->mProps, then re-runs the detector
+// setup with the new settings. Returns false if either step fails.
+bool VoiceActivityDetector::handlePropsChange(frame_sp& frame)
+{
+	auto ret = Module::handlePropsChange(frame, mDetail->mProps);
+	// The props object was passed in directly above, so no extra copy is needed.
+	// init() always runs, and its failure is reported instead of being discarded.
+	return mDetail->init() && ret;
+}
+
+bool VoiceActivityDetector::processEOS(string& pinId)
+{
+	return true;  // end-of-stream needs no handling here; pinId is unused
+}
diff --git a/base/test/VoiceActivityDetectorTests.cpp b/base/test/VoiceActivityDetectorTests.cpp
new file mode 100644
index 000000000..f121371d3
--- /dev/null
+++ b/base/test/VoiceActivityDetectorTests.cpp
@@ -0,0 +1,191 @@
+#include "stdafx.h"
+#include <boost/test/unit_test.hpp>
+#include <fstream>
+#include "VoiceActivityDetector.h"
+#include "AudioCaptureSrc.h"
+#include "FileWriterModule.h"
+#include "PipeLine.h"
+#include "Logger.h"
+#include "test_utils.h"
+
+BOOST_AUTO_TEST_SUITE(voice_activity_detector_tests)
+
+// Manual, interactive test: records from the capture source for 10 seconds while a person
+// stays silent and then speaks. Nothing is asserted on the output, so it is disabled by default.
+BOOST_AUTO_TEST_CASE(voice_activity_detector_basic_test, *boost::unit_test::disabled())
+{
+	Logger::setLogLevel(boost::log::trivial::severity_level::info);
+	LOG_INFO << "Starting Voice Activity Detector Basic Test...";
+
+	int sampleRate = 16000;
+	int channels = 1;
+	int processingInterval = 10;
+	AudioCaptureSrcProps audioProps(sampleRate, channels, 0, processingInterval);
+	auto audioSrc = boost::shared_ptr<AudioCaptureSrc>(new AudioCaptureSrc(audioProps));
+
+	VoiceActivityDetectorProps vadProps(
+		16000,
+		VoiceActivityDetectorProps::VERY_AGGRESSIVE,
+		VoiceActivityDetectorProps::FRAME_10MS
+	);
+	auto vad = boost::shared_ptr<VoiceActivityDetector>(new VoiceActivityDetector(vadProps));
+
+	// Sink 1: Audio Passthrough
+	std::string fileAudio = "./data/basic_test_audio.raw";
+	FileWriterModuleProps sinkPropsAudio(fileAudio, true);
+	auto sinkAudio = boost::shared_ptr<FileWriterModule>(new FileWriterModule(sinkPropsAudio));
+
+	// Sink 2: VAD Result
+	std::string fileVad = "./data/basic_test_vad.raw";
+	FileWriterModuleProps sinkPropsVad(fileVad, true);
+	auto sinkVad = boost::shared_ptr<FileWriterModule>(new FileWriterModule(sinkPropsVad));
+
+	audioSrc->setNext(vad);
+	// Route each output pin to its own sink by frame type.
+	auto audioPins = vad->getAllOutputPinsByType(FrameMetadata::FrameType::AUDIO);
+	auto vadPins = vad->getAllOutputPinsByType(FrameMetadata::FrameType::GENERAL);
+
+	vad->setNext(sinkAudio, audioPins);
+	vad->setNext(sinkVad, vadPins);
+
+	PipeLine p("VoiceActivityDetectorTestPipeline");
+	p.appendModule(audioSrc);
+
+	LOG_INFO << "Initializing Pipeline...";
+	BOOST_TEST(p.init());
+
+	LOG_INFO << "Running for 10 seconds...";
+	LOG_INFO << "First 5 seconds: STAY SILENT (expect mostly 0s)";
+	LOG_INFO << "Last 5 seconds: SPEAK INTO MIC (expect mostly 1s)";
+	p.run_all_threaded();
+	boost::this_thread::sleep_for(boost::chrono::seconds(10));
+
+	LOG_INFO << "Stopping Pipeline...";
+	p.stop();
+	p.term();
+	p.wait_for_all();
+}
+
+BOOST_AUTO_TEST_CASE(voice_activity_detector_aggressiveness_test)
+{
+	Logger::setLogLevel(boost::log::trivial::severity_level::info);
+	LOG_INFO << "Testing different aggressiveness modes...";
+	// NOTE(review): source->init() is asserted below, so this presumably needs an audio capture device - confirm it can run in CI.
+	// Test all 4 aggressiveness modes
+	VoiceActivityDetectorProps::AggressivenessMode modes[] = {
+		VoiceActivityDetectorProps::QUALITY,
+		VoiceActivityDetectorProps::LOW_BITRATE,
+		VoiceActivityDetectorProps::AGGRESSIVE,
+		VoiceActivityDetectorProps::VERY_AGGRESSIVE
+	};
+
+	for (int i = 0; i < 4; i++) {
+		LOG_INFO << "Testing mode " << i << "...";
+		// The source is attached to give the detector the single input pin that validateInputPins() requires.
+		AudioCaptureSrcProps audioProps(16000, 1, 0, 10);
+		auto source = boost::shared_ptr<AudioCaptureSrc>(new AudioCaptureSrc(audioProps));
+
+		VoiceActivityDetectorProps props(16000, modes[i], VoiceActivityDetectorProps::FRAME_10MS);
+		auto vad = boost::shared_ptr<VoiceActivityDetector>(new VoiceActivityDetector(props));
+		
+		source->setNext(vad);
+
+		BOOST_TEST(source->init());
+		BOOST_TEST(vad->init());
+
+		// Verify the mode was set correctly
+		auto currentProps = vad->getProps();
+		BOOST_TEST(currentProps.mode == modes[i]);
+
+		vad->term(); 
+		source->term();
+	}
+	
+	LOG_INFO << "All aggressiveness modes initialized successfully";
+}
+
+BOOST_AUTO_TEST_CASE(voice_activity_detector_props_test)
+{
+	Logger::setLogLevel(boost::log::trivial::severity_level::info);
+	LOG_INFO << "Testing getProps and setProps...";
+	// NOTE(review): source->init() is required below, so this presumably needs an audio capture device - confirm it can run in CI.
+	// 1. Setup helper source to satisfy input pin requirement
+	AudioCaptureSrcProps audioProps(16000, 1, 0, 10);
+	auto source = boost::shared_ptr<AudioCaptureSrc>(new AudioCaptureSrc(audioProps));
+
+	VoiceActivityDetectorProps initialProps(
+		16000,
+		VoiceActivityDetectorProps::QUALITY,
+		VoiceActivityDetectorProps::FRAME_10MS
+	);
+	auto vad = boost::shared_ptr<VoiceActivityDetector>(new VoiceActivityDetector(initialProps));
+
+	// Connect source to vad so validation passes (expects 1 input pin)
+	source->setNext(vad);
+
+	// Create two sinks for the two output pins (Audio, VAD)
+	std::string filenameAudio = "./data/test_audio.raw";
+	FileWriterModuleProps sinkPropsAudio(filenameAudio, true);
+	auto sinkAudio = boost::shared_ptr<FileWriterModule>(new FileWriterModule(sinkPropsAudio));
+
+	std::string filenameVad = "./data/test_vad.raw";
+	FileWriterModuleProps sinkPropsVad(filenameVad, true);
+	auto sinkVad = boost::shared_ptr<FileWriterModule>(new FileWriterModule(sinkPropsVad));
+
+	// Connect VAD outputs
+	// This ensures both output pins are connected
+	vad->setNext(sinkAudio); 
+	vad->setNext(sinkVad);
+
+	// Initialize
+	if (!source->init()) 
+	{
+		BOOST_ERROR("Source init failed");
+		return;
+	}
+	if (!sinkAudio->init())
+	{
+		BOOST_ERROR("SinkAudio init failed");
+		return;
+	}
+	if (!sinkVad->init())
+	{
+		BOOST_ERROR("SinkVad init failed");
+		return;
+	}
+	if (!vad->init())
+	{
+		BOOST_ERROR("VAD init failed");
+		return;
+	}
+
+	// Verify initial props
+	auto currentProps = vad->getProps();
+	BOOST_TEST(currentProps.sampleRate == 16000);
+	BOOST_TEST(currentProps.mode == VoiceActivityDetectorProps::QUALITY);
+	BOOST_TEST(currentProps.frameLength == VoiceActivityDetectorProps::FRAME_10MS);
+
+	// Change props
+	VoiceActivityDetectorProps newProps(
+		16000,
+		VoiceActivityDetectorProps::VERY_AGGRESSIVE,
+		VoiceActivityDetectorProps::FRAME_20MS
+	);
+	
+	// setProps is async - it queues a command
+	vad->setProps(newProps);
+	// One step() is expected to dequeue the command and apply it via handlePropsChange().
+	vad->step(); 
+
+	// Verify updated props
+	currentProps = vad->getProps();
+	BOOST_TEST(currentProps.mode == VoiceActivityDetectorProps::VERY_AGGRESSIVE);
+	BOOST_TEST(currentProps.frameLength == VoiceActivityDetectorProps::FRAME_20MS);
+	
+	vad->term();
+	source->term();
+	sinkAudio->term();
+	sinkVad->term();
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/base/test/view_voice_activity_detector_output.py b/base/test/view_voice_activity_detector_output.py
new file mode 100644
index 000000000..84f53acd3
--- /dev/null
+++ b/base/test/view_voice_activity_detector_output.py
@@ -0,0 +1,71 @@
+import struct
+import sys
+import os
+
+def visualize_vad(filename, frame_ms=10):
+    """Print a summary and ASCII timeline of a raw VAD result file.
+
+    The file is read as consecutive little-endian 32-bit signed integers,
+    one per frame (0 = silence, 1 = speech); trailing bytes that do not
+    fill a whole integer are dropped with a warning.
+
+    Args:
+        filename: Path to the raw VAD output file.
+        frame_ms: Duration of one VAD frame in milliseconds; only used to
+            label the timeline. Defaults to 10.
+    """
+    if not os.path.exists(filename):
+        print(f"Error: File '{filename}' not found.")
+        return
+
+    print(f"Reading VAD output from: {filename}")
+    with open(filename, 'rb') as f:
+        data = f.read()
+
+    num_samples = len(data) // 4
+    print(f"Total Frames: {num_samples}")
+
+    valid_bytes_len = num_samples * 4
+    if len(data) != valid_bytes_len:
+        print(f"Warning: File size {len(data)} is not a multiple of 4. Truncating {len(data) - valid_bytes_len} bytes.")
+        data = data[:valid_bytes_len]
+
+    values = struct.unpack(f'<{num_samples}i', data)
+    zeros = values.count(0)
+    ones = values.count(1)
+
+    print("-" * 40)
+    print(f"Silence Frames (0): {zeros}")
+    print(f"Speech Frames  (1): {ones}")
+    if num_samples > 0:
+        print(f"Speech Activity:    {(ones / num_samples) * 100:.2f}%")
+    print("-" * 40)
+
+    chunk_size = 10
+    print(f"\nTimeline (each char = {chunk_size} frames = {chunk_size * frame_ms}ms):")
+    print("Legend: '_' = Silence, '#' = Speech, '.' = Mixed")
+
+    # One symbol per chunk; the last chunk may be shorter than chunk_size.
+    symbols = []
+    for start in range(0, num_samples, chunk_size):
+        chunk = values[start:start + chunk_size]
+        if all(v == 0 for v in chunk):
+            symbols.append("_")
+        elif all(v == 1 for v in chunk):
+            symbols.append("#")
+        else:
+            symbols.append(".")
+    for row in range(0, len(symbols), 64):  # wrap at 64 symbols per line
+        print("".join(symbols[row:row + 64]))
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        # No path given: show usage, then fall back to the default file if present.
+        print("Usage: python view_voice_activity_detector_output.py <path_to_raw_file>")
+        print("Example: python view_voice_activity_detector_output.py vad_output.raw")
+        default_path = "data/vad_output.raw"
+        if os.path.exists(default_path):
+            print(f"\nNo file specified. Found default: {default_path}")
+            visualize_vad(default_path)
+    else:
+        visualize_vad(sys.argv[1])
diff --git a/base/vcpkg.json b/base/vcpkg.json
index 02da3473d..87cc38cfa 100644
--- a/base/vcpkg.json
+++ b/base/vcpkg.json
@@ -14,20 +14,11 @@
     }
   ],
   "dependencies": [
-    {
-      "name": "whisper",
-      "default-features": false,
-      "features": [
-        "cuda"
-      ]
-    },
     {
       "name": "opencv4",
       "default-features": false,
       "features": [
         "contrib",
-        "cuda",
-        "cudnn",
         "dnn",
         "jpeg",
         "nonfree",
@@ -61,6 +52,8 @@
     "zlib",
     "sfml",
     "brotli",
+    "whisper",
+    "libfvad",
     {
       "name": "gtk3",
       "platform": "!windows"
@@ -99,4 +92,4 @@
       "name": "libmp4"
     }
   ]
-}
+}
\ No newline at end of file
diff --git a/thirdparty/custom-overlay/libfvad/portfile.cmake b/thirdparty/custom-overlay/libfvad/portfile.cmake
new file mode 100644
index 000000000..cba47d067
--- /dev/null
+++ b/thirdparty/custom-overlay/libfvad/portfile.cmake
@@ -0,0 +1,17 @@
+vcpkg_from_github(
+    OUT_SOURCE_PATH SOURCE_PATH
+    REPO dpirch/libfvad
+    REF 532ab666c20d3cfda38bca63abbb0f152706c369
+    SHA512 926fb7155aae7a4ca6caf8e31a06e96125f8becda45bbb1218b2d2941b4ebf4e90d8552718e497b80a90d21a6813165d5e217cc354919eea4f2297d89226ed86
+    HEAD_REF master
+)
+# Configure the CMake project at the root of the fetched sources.
+vcpkg_cmake_configure(
+    SOURCE_PATH "${SOURCE_PATH}"
+)
+
+vcpkg_cmake_install()
+vcpkg_copy_pdbs()
+# Headers are configuration-independent; drop the debug copy that vcpkg's post-build checks flag.
+file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include")
+file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright)
diff --git a/thirdparty/custom-overlay/libfvad/vcpkg.json b/thirdparty/custom-overlay/libfvad/vcpkg.json
new file mode 100644
index 000000000..b1bde443c
--- /dev/null
+++ b/thirdparty/custom-overlay/libfvad/vcpkg.json
@@ -0,0 +1,16 @@
+{
+    "name": "libfvad",
+    "version": "1.0.1",
+    "description": "Voice activity detection (VAD) library, based on WebRTC's VAD engine.",
+    "homepage": "https://github.com/dpirch/libfvad",
+    "dependencies": [
+        {
+            "name": "vcpkg-cmake",
+            "host": true
+        },
+        {
+            "name": "vcpkg-cmake-config",
+            "host": true
+        }
+    ]
+}
\ No newline at end of file

From 85781d9f803ae01129bc30acab2ca243c0878188 Mon Sep 17 00:00:00 2001
From: Srishti Karanth <srishtik@apra.in>
Date: Mon, 22 Dec 2025 13:25:17 +0530
Subject: [PATCH 4/4] added the speechOnlyMode bool to get the .raw file with
 only speech false:audio has both speech and silence true:audio has only
 speech

---
 base/include/VoiceActivityDetector.h     |  4 +-
 base/src/VoiceActivityDetector.cpp       | 30 +++++++++---
 base/test/VoiceActivityDetectorTests.cpp | 58 ++++++++++++++++++------
 3 files changed, 70 insertions(+), 22 deletions(-)

diff --git a/base/include/VoiceActivityDetector.h b/base/include/VoiceActivityDetector.h
index a64a71463..f7c8e7d64 100644
--- a/base/include/VoiceActivityDetector.h
+++ b/base/include/VoiceActivityDetector.h
@@ -26,12 +26,14 @@ class VoiceActivityDetectorProps : public ModuleProps
 	VoiceActivityDetectorProps(
 		int _sampleRate = 16000,
 		AggressivenessMode _mode = QUALITY,
-		FrameLength _frameLength = FRAME_10MS
+		FrameLength _frameLength = FRAME_10MS,
+		bool _speechOnly = false  // When true, only outputs audio when speech is detected
 	);
 	
 	int sampleRate;
 	AggressivenessMode mode;
 	FrameLength frameLength;
+	bool speechOnly;  // If true, audio passthrough only sends frames with speech
 	
 	size_t getSerializeSize();
 
diff --git a/base/src/VoiceActivityDetector.cpp b/base/src/VoiceActivityDetector.cpp
index 06b0f6d7d..b0dc1b383 100644
--- a/base/src/VoiceActivityDetector.cpp
+++ b/base/src/VoiceActivityDetector.cpp
@@ -10,17 +10,20 @@
 VoiceActivityDetectorProps::VoiceActivityDetectorProps(
 	int _sampleRate,
 	AggressivenessMode _mode,
-	FrameLength _frameLength
+	FrameLength _frameLength,
+	bool _speechOnly
 ) : sampleRate(_sampleRate),
 	mode(_mode),
-	frameLength(_frameLength)
+	frameLength(_frameLength),
+	speechOnly(_speechOnly)
 {}
 
 size_t VoiceActivityDetectorProps::getSerializeSize() {
 	return ModuleProps::getSerializeSize() +
 		sizeof(sampleRate) +
 		sizeof(mode) +
-		sizeof(frameLength);
+		sizeof(frameLength) +
+		sizeof(speechOnly);
 }
 
 template <class Archive>
@@ -29,6 +32,7 @@ void VoiceActivityDetectorProps::serialize(Archive& ar, const unsigned int versi
 	ar& sampleRate;
 	ar& mode;
 	ar& frameLength;
+	ar& speechOnly;
 }
 
 class VoiceActivityDetector::Detail
@@ -72,6 +76,7 @@ class VoiceActivityDetector::Detail
 		LOG_INFO << "Sample Rate: " << mProps.sampleRate << " Hz";
 		LOG_INFO << "Aggressiveness Mode: " << mProps.mode << " (0=Quality, 3=Very Aggressive)";
 		LOG_INFO << "Frame Length: " << mProps.frameLength << " ms";
+		LOG_INFO << "Speech Only Mode: " << (mProps.speechOnly ? "ON" : "OFF");
 		LOG_INFO << "Expected samples per frame: " << (mProps.sampleRate * mProps.frameLength / 1000);
 		LOG_INFO << "========================";
 		
@@ -198,13 +203,24 @@ bool VoiceActivityDetector::process(frame_container& frames)
 	
 	// 2. Call libfvad
 	int result = mDetail->processAudio(samples, sampleCount);
+	bool isSpeech = (result == 1);
 	
-	// 3. Audio Passthrough
-	frames.insert(make_pair(mDetail->mAudioPinId, frame));
+	// 3. Audio Passthrough - Conditional on speechOnly mode
+	// speechOnly:
+	//   false: audio output contains both speech and silence
+	//   true:  audio output contains only speech
+	if (!mDetail->mProps.speechOnly ) {
+		frames.insert(make_pair(mDetail->mAudioPinId, frame));
+	}
+	else {
+		if(isSpeech) {
+			frames.insert(make_pair(mDetail->mAudioPinId, frame));
+		}
+	}
 
-	// 4. Create output frame with VAD result
+	// 4. Create output frame with VAD result - ALWAYS create and send
 	auto outFrame = makeFrame(sizeof(int), mDetail->mVadPinId);
-	int vadResult = (result == 1) ? 1 : 0;  
+	int vadResult = isSpeech ? 1 : 0;  
 	memcpy(outFrame->data(), &vadResult, sizeof(int));
 	
 	// 5. Send output
diff --git a/base/test/VoiceActivityDetectorTests.cpp b/base/test/VoiceActivityDetectorTests.cpp
index f121371d3..b623bf30c 100644
--- a/base/test/VoiceActivityDetectorTests.cpp
+++ b/base/test/VoiceActivityDetectorTests.cpp
@@ -10,52 +10,62 @@
 
 BOOST_AUTO_TEST_SUITE(voice_activity_detector_tests)
 
-BOOST_AUTO_TEST_CASE(voice_activity_detector_basic_test)
+// TEST: Speech-only output using speechOnly mode
+BOOST_AUTO_TEST_CASE(voice_activity_filter_speech_only_test)
 {
 	Logger::setLogLevel(boost::log::trivial::severity_level::info);
-	LOG_INFO << "Starting Voice Activity Detector Basic Test...";
+	LOG_INFO << "Starting Speech-Only Mode Test...";
+	LOG_INFO << "This test saves BOTH output pins for comparison";
 
 	int sampleRate = 16000;
 	int channels = 1;  
 	int processingInterval = 10;  
 	
+	// Audio source
 	AudioCaptureSrcProps audioProps(sampleRate, channels, 0, processingInterval);
 	auto audioSrc = boost::shared_ptr<AudioCaptureSrc>(new AudioCaptureSrc(audioProps));
 
+	// CHANGE THIS to true/false to test different modes
+	bool speechOnlyMode = true;  
+	
 	VoiceActivityDetectorProps vadProps(
 		16000,  
 		VoiceActivityDetectorProps::VERY_AGGRESSIVE, 
-		VoiceActivityDetectorProps::FRAME_10MS  
+		VoiceActivityDetectorProps::FRAME_10MS,
+		speechOnlyMode 
 	);
 	auto vad = boost::shared_ptr<VoiceActivityDetector>(new VoiceActivityDetector(vadProps));
 
-	// Sink 1: Audio Passthrough
-	std::string fileAudio = "./data/basic_test_audio.raw";
+	// Sink 1: Audio output
+	std::string fileAudio = speechOnlyMode ? "./data/speech_only_audio.raw" : "./data/all_audio.raw";
 	FileWriterModuleProps sinkPropsAudio(fileAudio, true);
 	auto sinkAudio = boost::shared_ptr<FileWriterModule>(new FileWriterModule(sinkPropsAudio));
 
-	// Sink 2: VAD Result
-	std::string fileVad = "./data/basic_test_vad.raw";
+	// Sink 2: VAD results output
+	std::string fileVad = "./data/vad.raw";
 	FileWriterModuleProps sinkPropsVad(fileVad, true);
 	auto sinkVad = boost::shared_ptr<FileWriterModule>(new FileWriterModule(sinkPropsVad));
 
+	// Build pipeline: AudioSrc -> VAD -> Two FileWriters
 	audioSrc->setNext(vad);
 	
+	// Connect BOTH output pins
 	auto audioPins = vad->getAllOutputPinsByType(FrameMetadata::FrameType::AUDIO);
 	auto vadPins = vad->getAllOutputPinsByType(FrameMetadata::FrameType::GENERAL);
+	
+	vad->setNext(sinkAudio, audioPins);  // Pin 1: Audio → sinkAudio
+	vad->setNext(sinkVad, vadPins);      // Pin 2: VAD → sinkVad
 
-	vad->setNext(sinkAudio, audioPins);
-	vad->setNext(sinkVad, vadPins);
-
-	PipeLine p("VoiceActivityDetectorTestPipeline");
+	PipeLine p("SpeechOnlyTestPipeline");
 	p.appendModule(audioSrc);
 
 	LOG_INFO << "Initializing Pipeline...";
 	BOOST_TEST(p.init());
 
-	LOG_INFO << "Running for 10 seconds...";
-	LOG_INFO << "First 5 seconds: STAY SILENT (expect mostly 0s)";
-	LOG_INFO << "Last 5 seconds: SPEAK INTO MIC (expect mostly 1s)";
+	LOG_INFO << "===========================================";
+	LOG_INFO << "speechOnly mode: " << (speechOnlyMode ? "TRUE" : "FALSE");
+	LOG_INFO << "Recording for 10 seconds...";
+	LOG_INFO << "===========================================";
 	
 	p.run_all_threaded();
 	boost::this_thread::sleep_for(boost::chrono::seconds(10));
@@ -64,6 +74,25 @@ BOOST_AUTO_TEST_CASE(voice_activity_detector_basic_test)
 	p.stop();
 	p.term();
 	p.wait_for_all();
+
+	LOG_INFO << "";
+	LOG_INFO << "===========================================";
+	LOG_INFO << "Test complete! Files saved:";
+	LOG_INFO << "  Audio output: " << fileAudio;
+	LOG_INFO << "  VAD output:   " << fileVad;
+	LOG_INFO << "";
+	LOG_INFO << "To play audio in Audacity:";
+	LOG_INFO << "  1.File > Import > Raw Data";
+	LOG_INFO << "  2. Select: " << fileAudio;
+	LOG_INFO << "  3. Settings:";
+	LOG_INFO << "     - Encoding: Signed 16-bit PCM";
+	LOG_INFO << "     - Byte order: Little-endian";
+	LOG_INFO << "     - Channels: 1 (Mono)";
+	LOG_INFO << "     - Sample rate: 16000 Hz";
+	LOG_INFO << "";
+	LOG_INFO << "To view VAD results:";
+	LOG_INFO << "  python view_voice_activity_detector_output.py " << fileVad;
+	LOG_INFO << "===========================================";
 }
 
 BOOST_AUTO_TEST_CASE(voice_activity_detector_aggressiveness_test)
@@ -189,3 +218,4 @@ BOOST_AUTO_TEST_CASE(voice_activity_detector_props_test)
 }
 
 BOOST_AUTO_TEST_SUITE_END()
+