Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions base/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ find_package(ZXing CONFIG REQUIRED)
find_package(bigint CONFIG REQUIRED)
find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED)
find_package(whisper CONFIG REQUIRED)
# Locate the libfvad header directory (the one containing fvad.h) and the
# libfvad library file. FVAD_INCLUDE_DIR is later added to the aprapipes
# include path and FVAD_LIBRARY is later linked into aprapipesut.
# NOTE(review): neither lookup is marked REQUIRED; when nothing is found CMake
# sets the variable to <VAR>-NOTFOUND - confirm that case is handled.
find_path(FVAD_INCLUDE_DIR fvad.h)
find_library(FVAD_LIBRARY NAMES fvad libfvad)


IF(ENABLE_LINUX)
Expand Down Expand Up @@ -165,6 +167,9 @@ ENDIF(ENABLE_CUDA)

include_directories(AFTER SYSTEM include)

# Pull the libfvad sources under ../thirdparty into this build (binary dir
# "libfvad") and add its public headers to the include path as system headers.
# NOTE(review): libfvad is also searched for with find_path()/find_library()
# near the top of this file - confirm that both the in-tree build and the
# external lookup are intended.
add_subdirectory(../thirdparty/libfvad libfvad)
include_directories(AFTER SYSTEM ../thirdparty/libfvad/include)

# ApraPipes library

SET(CORE_FILES
Expand Down Expand Up @@ -339,6 +344,7 @@ SET(IP_FILES
src/OverlayFactory.cpp
src/TestSignalGeneratorSrc.cpp
src/AudioToTextXForm.cpp
src/VoiceActivityDetector.cpp
src/AbsControlModule.cpp
src/ThumbnailListGenerator.cpp
)
Expand Down Expand Up @@ -366,6 +372,7 @@ SET(IP_FILES_H
include/ColorConversionXForm.h
include/Overlay.h
include/AudioToTextXForm.h
include/VoiceActivityDetector.h
include/ThumbnailListGenerator.h
)

Expand Down Expand Up @@ -523,6 +530,7 @@ target_include_directories ( aprapipes PRIVATE
${BARESIP_INC_DIR}
${LIBRE_INC_DIR}
${NVCODEC_INCLUDE_DIR}
${FVAD_INCLUDE_DIR}
)


Expand Down Expand Up @@ -627,6 +635,7 @@ SET(UT_FILES
test/mp4_dts_strategy_tests.cpp
test/overlaymodule_tests.cpp
test/testSignalGeneratorSrc_tests.cpp
test/VoiceActivityDetectorTests.cpp
test/audioToTextXform_tests.cpp
test/simpleControlModuleTests.cpp
${ARM64_UT_FILES}
Expand Down Expand Up @@ -692,6 +701,7 @@ target_link_libraries(aprapipesut
bigint::bigint
sfml-audio
whisper::whisper
${FVAD_LIBRARY}
)

IF(ENABLE_WINDOWS)
Expand Down
64 changes: 64 additions & 0 deletions base/include/VADTransform.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#pragma once

#include "Module.h"

class VADTransformProps : public ModuleProps
{
public:
enum AggressivenessMode {
QUALITY = 0, // Least aggressive (best quality, catches more speech)
LOW_BITRATE = 1, // Moderate
AGGRESSIVE = 2, // More aggressive
VERY_AGGRESSIVE = 3 // Most aggressive (best bandwidth saving, only clear speech)
};

enum FrameLength {
FRAME_10MS = 10,
FRAME_20MS = 20,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add comments to describe this modes, as in effects of setting frame length

FRAME_30MS = 30
};

VADTransformProps(
int _sampleRate = 16000,
AggressivenessMode _mode = QUALITY,
FrameLength _frameLength = FRAME_10MS
);

int sampleRate;
AggressivenessMode mode;
FrameLength frameLength;

size_t getSerializeSize();

private:
friend class boost::serialization::access;

template <class Archive>
void serialize(Archive& ar, const unsigned int version);
};

// Module configured by VADTransformProps (VAD = voice activity detection, per
// the speech-related modes declared in VADTransformProps).
// Only the interface is declared here; the member definitions and the nested
// Detail class are not part of this header.
class VADTransform : public Module
{
public:
// Takes the configuration by value.
VADTransform(VADTransformProps _props);
virtual ~VADTransform();

// Lifecycle entry points returning success/failure as bool.
// NOTE(review): presumably these override Module::init()/term() - the base
// class is not visible from this header, confirm.
bool init();
bool term();
// Accessors for the module configuration; getProps() returns a copy.
void setProps(VADTransformProps& props);
VADTransformProps getProps();

protected:
// Per-frame and stream-event hooks.
// NOTE(review): names match the usual Module callback set (process / SOS /
// EOS / pin validation / props change) - verify against Module.h.
bool process(frame_container& frames);
bool processSOS(frame_sp& frame);
bool validateInputPins();
bool validateOutputPins();
void addInputPin(framemetadata_sp& metadata, string& pinId);
bool handlePropsChange(frame_sp& frame);
bool processEOS(string& pinId);

private:
void setMetadata(framemetadata_sp& metadata);
// Implementation class, forward-declared here and defined elsewhere;
// held through a shared pointer.
class Detail;
boost::shared_ptr<Detail> mDetail;
};
71 changes: 71 additions & 0 deletions base/include/VoiceActivityDetector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#pragma once

#include "Module.h"

// Configuration for the VoiceActivityDetector module: sample rate, detector
// aggressiveness, analysis frame duration and the speech-only output switch.
class VoiceActivityDetectorProps : public ModuleProps
{
public:

// How strictly audio is classified as speech vs. non-speech.
// Higher values are more aggressive (see the per-value notes).
enum AggressivenessMode {
QUALITY = 0, // Least aggressive (best quality, catches more speech)
LOW_BITRATE = 1, // Moderate
AGGRESSIVE = 2, // More aggressive
VERY_AGGRESSIVE = 3 // Most aggressive (best bandwidth saving, only clear speech)
};

// Duration of audio, in milliseconds, that the VAD analyzes at once.
// Samples per analyzed frame = sampleRate * frameLength / 1000.
// At the default sample rate of 16 kHz (16,000 samples/sec):
// 10 ms = 160 samples, 20 ms = 320 samples, 30 ms = 480 samples.
enum FrameLength {
FRAME_10MS = 10, // Lowest latency
FRAME_20MS = 20, // Balanced
FRAME_30MS = 30 // Best accuracy
};

// All arguments are optional; the defaults are 16 kHz, QUALITY, 10 ms frames
// and speechOnly disabled.
VoiceActivityDetectorProps(
int _sampleRate = 16000,
AggressivenessMode _mode = QUALITY,
FrameLength _frameLength = FRAME_10MS,
bool _speechOnly = false // When true, only outputs audio when speech is detected
);

int sampleRate; // Sample rate in Hz
AggressivenessMode mode; // Selected aggressiveness
FrameLength frameLength; // Selected analysis frame duration
bool speechOnly; // If true, audio passthrough only sends frames with speech

// Size used when serializing these props; defined outside this header.
size_t getSerializeSize();

private:
// Grants Boost.Serialization access to the private serialize() below.
friend class boost::serialization::access;

// Boost.Serialization hook; the definition is not part of this header.
template <class Archive>
void serialize(Archive& ar, const unsigned int version);
};

// Voice activity detection module configured by VoiceActivityDetectorProps.
// Only the interface is declared here; the member definitions and the nested
// Detail class are not part of this header.
class VoiceActivityDetector : public Module
{
public:
// Takes the configuration by value.
VoiceActivityDetector(VoiceActivityDetectorProps _props);
virtual ~VoiceActivityDetector();

// Lifecycle entry points returning success/failure as bool.
// NOTE(review): presumably these override Module::init()/term() - the base
// class is not visible from this header, confirm.
bool init();
bool term();
// Accessors for the module configuration; getProps() returns a copy.
void setProps(VoiceActivityDetectorProps& props);
VoiceActivityDetectorProps getProps();

protected:
// Per-frame and stream-event hooks.
// NOTE(review): names match the usual Module callback set (process / SOS /
// EOS / pin validation / props change) - verify against Module.h.
bool process(frame_container& frames);
bool processSOS(frame_sp& frame);
bool validateInputPins();
bool validateOutputPins();
void addInputPin(framemetadata_sp& metadata, string& pinId);
bool handlePropsChange(frame_sp& frame);
bool processEOS(string& pinId);

private:
void setMetadata(framemetadata_sp& metadata);
// Implementation class, forward-declared here and defined elsewhere;
// held through a shared pointer.
class Detail;
boost::shared_ptr<Detail> mDetail;
};
Loading
Loading