Apra-Labs · srikaranth · Dec 11, 2025 · Dec 11, 2025 · Dec 18, 2025 · Dec 22, 2025
diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt
@@ -50,6 +50,8 @@ find_package(ZXing CONFIG REQUIRED)
 find_package(bigint CONFIG REQUIRED)
 find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED)
 find_package(whisper CONFIG REQUIRED)
+find_path(FVAD_INCLUDE_DIR fvad.h) # Searches for the directory containing fvad.h; not REQUIRED, so the variable may end up as FVAD_INCLUDE_DIR-NOTFOUND.
+find_library(FVAD_LIBRARY NAMES fvad libfvad) # Searches for a library named fvad or libfvad. NOTE(review): this patch also builds ../thirdparty/libfvad via add_subdirectory -- confirm both the lookup and the vendored build are intended.
 
 
 IF(ENABLE_LINUX)
@@ -165,6 +167,9 @@ ENDIF(ENABLE_CUDA)
 
 include_directories(AFTER SYSTEM include)
 
+add_subdirectory(../thirdparty/libfvad libfvad) # Adds the vendored libfvad tree to the build, using "libfvad" as its binary directory. NOTE(review): FVAD_LIBRARY is also located with find_library in this file -- confirm which one is meant to be linked.
+include_directories(AFTER SYSTEM ../thirdparty/libfvad/include) # Appends the vendored libfvad headers as a SYSTEM include directory.
+
 # ApraPipes library
 
 SET(CORE_FILES
@@ -339,6 +344,7 @@ SET(IP_FILES
 	src/OverlayFactory.cpp
 	src/TestSignalGeneratorSrc.cpp
 	src/AudioToTextXForm.cpp 
+	src/VoiceActivityDetector.cpp
 	src/AbsControlModule.cpp
 	src/ThumbnailListGenerator.cpp
 )
@@ -366,6 +372,7 @@ SET(IP_FILES_H
 	include/ColorConversionXForm.h
 	include/Overlay.h
 	include/AudioToTextXForm.h
+	include/VoiceActivityDetector.h
 	include/ThumbnailListGenerator.h
 )
 
@@ -523,6 +530,7 @@ target_include_directories ( aprapipes PRIVATE
 	${BARESIP_INC_DIR}
 	${LIBRE_INC_DIR}
 	${NVCODEC_INCLUDE_DIR}
+	${FVAD_INCLUDE_DIR}
 )
 
 
@@ -627,6 +635,7 @@ SET(UT_FILES
 	test/mp4_dts_strategy_tests.cpp
 	test/overlaymodule_tests.cpp
 	test/testSignalGeneratorSrc_tests.cpp
+	test/VoiceActivityDetectorTests.cpp
 	test/audioToTextXform_tests.cpp
 	test/simpleControlModuleTests.cpp
 	${ARM64_UT_FILES}
@@ -692,6 +701,7 @@ target_link_libraries(aprapipesut
   bigint::bigint
   sfml-audio
   whisper::whisper
+  ${FVAD_LIBRARY}
   )
 
 IF(ENABLE_WINDOWS)

diff --git a/base/include/VADTransform.h b/base/include/VADTransform.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include "Module.h"
+
+class VADTransformProps : public ModuleProps // Settings for VADTransform: sample rate, aggressiveness mode and frame length.
+{ // NOTE(review): the CMake hunks of this patch list only VoiceActivityDetector.{h,cpp}; confirm this header is still needed.
+public:
+	enum AggressivenessMode { // Four levels, ordered from least (0) to most (3) aggressive.
+		QUALITY = 0,          // Least aggressive (best quality, catches more speech)
+		LOW_BITRATE = 1,      // Moderate
+		AGGRESSIVE = 2,       // More aggressive
+		VERY_AGGRESSIVE = 3   // Most aggressive (best bandwidth saving, only clear speech)
+	};
+
+	enum FrameLength { // Each enumerator's value matches the millisecond count in its name.
+		FRAME_10MS = 10,
+		FRAME_20MS = 20,
+		FRAME_30MS = 30
+	};
+
+	VADTransformProps( // Every argument has a default, so this also serves as the default constructor.
+		int _sampleRate = 16000, // presumably in Hz -- confirm against the implementation
+		AggressivenessMode _mode = QUALITY,
+		FrameLength _frameLength = FRAME_10MS
+	);
+
+	int sampleRate; // presumably initialised from _sampleRate; the constructor body is not in this header
+	AggressivenessMode mode;
+	FrameLength frameLength;
+
+	size_t getSerializeSize(); // Declared only; presumably the serialized size of these props -- confirm in the .cpp.
+
+private:
+	friend class boost::serialization::access; // Grants Boost.Serialization access to the private serialize() below.
+
+	template <class Archive>
+	void serialize(Archive& ar, const unsigned int version); // Declared only. NOTE(review): the template's definition is not in this header -- confirm it is visible wherever an archive instantiates it.
+};
+
+class VADTransform : public Module // Module constructed from VADTransformProps; holds a private Detail object that is defined elsewhere.
+{
+public:
+	VADTransform(VADTransformProps _props); // Takes the props by value.
+	virtual ~VADTransform();
+
+	bool init(); // NOTE(review): init/term and the protected hooks look like Module overrides, but none is marked override -- confirm against Module.h.
+	bool term();
+	void setProps(VADTransformProps& props); // Takes the new props by non-const reference.
+	VADTransformProps getProps(); // Returns the props by value.
+
+protected:
+	bool process(frame_container& frames);
+	bool processSOS(frame_sp& frame); // SOS presumably means start-of-stream -- confirm.
+	bool validateInputPins();
+	bool validateOutputPins();
+	void addInputPin(framemetadata_sp& metadata, string& pinId);
+	bool handlePropsChange(frame_sp& frame); // presumably the receiving side of setProps() -- confirm in the .cpp.
+	bool processEOS(string& pinId); // EOS presumably means end-of-stream -- confirm.
+
+private:
+	void setMetadata(framemetadata_sp& metadata);
+	class Detail; // Forward declaration; the definition is not in this header.
+	boost::shared_ptr<Detail> mDetail; // Shared pointer to the Detail object.
+};
diff --git a/base/include/VoiceActivityDetector.h b/base/include/VoiceActivityDetector.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include "Module.h"
+
+class VoiceActivityDetectorProps : public ModuleProps // Settings for VoiceActivityDetector: sample rate, aggressiveness, frame length and the speech-only switch.
+{
+public:
+
+	// Controls how aggressively audio is classified as speech vs. no-speech.
+	enum AggressivenessMode {
+		QUALITY = 0,          // Least aggressive (best quality, catches more speech)
+		LOW_BITRATE = 1,      // Moderate
+		AGGRESSIVE = 2,       // More aggressive
+		VERY_AGGRESSIVE = 3   // Most aggressive (best bandwidth saving, only clear speech)
+	};
+
+	// Duration of audio the VAD analyzes at once.
+	// At the default sample rate of 16 kHz (16,000 samples/sec):
+	// 10 ms = 160 samples, 20 ms = 320 samples, 30 ms = 480 samples.
+	enum FrameLength {
+		FRAME_10MS = 10,      // Lowest latency
+		FRAME_20MS = 20,      // Balanced
+		FRAME_30MS = 30       // Best accuracy
+	};
+
+	VoiceActivityDetectorProps( // Every argument has a default, so this also serves as the default constructor.
+		int _sampleRate = 16000, // Samples per second; 16000 is the 16 kHz default described above.
+		AggressivenessMode _mode = QUALITY,
+		FrameLength _frameLength = FRAME_10MS,
+		bool _speechOnly = false  // When true, only outputs audio when speech is detected
+	);
+
+	int sampleRate; // Samples per second.
+	AggressivenessMode mode;
+	FrameLength frameLength;
+	bool speechOnly;  // If true, audio passthrough only sends frames with speech
+
+	size_t getSerializeSize(); // Declared only; presumably the serialized size of these props -- confirm in the .cpp.
+
+private:
+	friend class boost::serialization::access; // Grants Boost.Serialization access to the private serialize() below.
+
+	template <class Archive>
+	void serialize(Archive& ar, const unsigned int version); // Declared only. NOTE(review): the template's definition is not in this header -- confirm it is visible wherever an archive instantiates it.
+};
+
+class VoiceActivityDetector : public Module // Module constructed from VoiceActivityDetectorProps; holds a private Detail object that is defined elsewhere.
+{
+public:
+	VoiceActivityDetector(VoiceActivityDetectorProps _props); // Takes the props by value.
+	virtual ~VoiceActivityDetector();
+
+	bool init(); // NOTE(review): init/term and the protected hooks look like Module overrides, but none is marked override -- confirm against Module.h.
+	bool term();
+	void setProps(VoiceActivityDetectorProps& props); // Takes the new props by non-const reference.
+	VoiceActivityDetectorProps getProps(); // Returns the props by value.
+
+protected:
+	bool process(frame_container& frames);
+	bool processSOS(frame_sp& frame); // SOS presumably means start-of-stream -- confirm.
+	bool validateInputPins();
+	bool validateOutputPins();
+	void addInputPin(framemetadata_sp& metadata, string& pinId);
+	bool handlePropsChange(frame_sp& frame); // presumably the receiving side of setProps() -- confirm in the .cpp.
+	bool processEOS(string& pinId); // EOS presumably means end-of-stream -- confirm.
+
+private:
+	void setMetadata(framemetadata_sp& metadata);
+	class Detail; // Forward declaration; the definition is not in this header.
+	boost::shared_ptr<Detail> mDetail; // Shared pointer to the Detail object.
+};