Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package ai.javaclaw.speech;

/**
 * Unchecked exception signalling that a speech-to-text operation failed.
 *
 * <p>Because it extends {@link RuntimeException}, callers are not forced by the
 * compiler to handle it and must catch it explicitly where a failed
 * transcription should be recoverable.
 */
public class SpeechToTextException extends RuntimeException {

    /**
     * Creates an exception that carries only a description of the failure.
     *
     * @param message human-readable description of what went wrong
     */
    public SpeechToTextException(final String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps the lower-level failure that triggered it.
     *
     * @param message human-readable description of what went wrong
     * @param cause   the underlying failure, kept for diagnostics
     */
    public SpeechToTextException(final String message, final Throwable cause) {
        super(message, cause);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package ai.javaclaw.speech;

import java.io.InputStream;

/**
 * Abstraction over a speech-to-text backend: turns an audio stream into text.
 */
public interface SpeechToTextService {

/**
 * Transcribes the audio available from the given stream.
 *
 * <p>NOTE(review): the contract does not state who owns the stream or which
 * audio formats are accepted - confirm with the implementations. Failures are
 * presumably reported through the unchecked {@code SpeechToTextException}
 * from this package, since no checked exception is declared - confirm.
 *
 * @param audioStream source of the audio data to transcribe
 * @return the transcribed text
 */
String transcribe(InputStream audioStream);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package ai.javaclaw.speech;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;

@Service
@ConditionalOnProperty(name = "speech.provider", havingValue = "whisper-cpp")
public class WhisperCppSpeechToTextService implements SpeechToTextService {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've been looking for a java library that does speech to text and I found vosk: https://github.com/alphacep/vosk-api. If it works, what do you think of making it the default @soufianebouaddis? We could also drop this implementation which requires having both ffmpeg and whisper-cli.


private static final Logger LOGGER = LoggerFactory.getLogger(WhisperCppSpeechToTextService.class);

private final String modelPath;

/**
 * Creates the service with the model location injected from configuration.
 *
 * @param modelPath value of the {@code speech.whisper-cpp.model-path} property;
 *                  it is later handed to {@code whisper-cli} through its {@code -m} option
 */
public WhisperCppSpeechToTextService(@Value("${speech.whisper-cpp.model-path}") final String modelPath) {
    this.modelPath = modelPath;
}

/** Upper bound, in seconds, for a single whisper-cli run. */
private static final long WHISPER_TIMEOUT_SECONDS = 60;

/** Upper bound, in seconds, for the ffmpeg OGG-to-WAV conversion. */
private static final long FFMPEG_TIMEOUT_SECONDS = 30;

/**
 * Transcribes the given audio by converting it to WAV with {@code ffmpeg} and
 * running {@code whisper-cli} on the result.
 *
 * <p>The audio is written to a temporary {@code .ogg} file, converted to a
 * 16 kHz mono {@code .wav} file, and transcribed into a temporary {@code .txt}
 * file whose trimmed content is returned. All temporary files are removed
 * before the method returns, whether it succeeds or fails.
 *
 * @param audioStream audio to transcribe; it is read to the end but not closed here
 * @return the non-blank transcription produced by whisper-cli
 * @throws SpeechToTextException if a tool cannot be started, times out, exits with a
 *                               non-zero code, produces no output file, or produces
 *                               an empty transcription
 */
@Override
public String transcribe(InputStream audioStream) {
    LOGGER.info("Transcribing audio via whisper-cpp (model: {})", modelPath);

    Path oggFile = null;
    Path wavFile = null;
    Path outputFile = null;

    try {
        oggFile = Files.createTempFile("whisper-input-", ".ogg");
        Files.write(oggFile, audioStream.readAllBytes());

        wavFile = Files.createTempFile("whisper-input-", ".wav");
        convertOggToWav(oggFile, wavFile);

        // The temp file only reserves a unique name. whisper-cli gets the name without
        // its extension (-of) and is expected to create "<base>.txt" itself (-otxt), so
        // the placeholder is removed to make "no output produced" detectable.
        outputFile = Files.createTempFile("whisper-output-", ".txt");
        String outputPath = outputFile.toString();
        // Strip only the trailing extension; a ".txt" elsewhere in the path must survive.
        String outputBase = outputPath.substring(0, outputPath.length() - ".txt".length());
        Files.deleteIfExists(outputFile);

        StringBuilder toolOutput = new StringBuilder();
        int exitCode;
        try {
            exitCode = runCommand(toolOutput, WHISPER_TIMEOUT_SECONDS,
                    "whisper-cli",
                    "-m", modelPath,
                    "-f", wavFile.toString(),
                    "-otxt",
                    "-of", outputBase,
                    "--no-prints");
        } catch (java.util.concurrent.TimeoutException e) {
            throw new SpeechToTextException(
                    "whisper-cli timed out after " + WHISPER_TIMEOUT_SECONDS + " seconds", e);
        }

        if (exitCode != 0) {
            throw new SpeechToTextException("whisper-cli exited with code " + exitCode + ": " + toolOutput);
        }

        if (!Files.exists(outputFile)) {
            throw new SpeechToTextException("whisper-cli did not produce output file");
        }

        String text = Files.readString(outputFile).trim();
        if (text.isBlank()) {
            throw new SpeechToTextException("whisper-cli returned empty transcription");
        }

        LOGGER.info("whisper-cpp transcription completed successfully");
        return text;

    } catch (InterruptedException e) {
        // Restore the flag so code further up the stack can still observe the interruption.
        Thread.currentThread().interrupt();
        throw new SpeechToTextException("Failed to run whisper-cli", e);
    } catch (IOException e) {
        throw new SpeechToTextException("Failed to run whisper-cli", e);
    } finally {
        deleteSilently(oggFile);
        deleteSilently(wavFile);
        deleteSilently(outputFile);
    }
}

/**
 * Converts an OGG file into a 16 kHz, single-channel WAV file using {@code ffmpeg}.
 *
 * @param oggFile existing input file
 * @param wavFile destination file; overwritten if present ({@code -y})
 * @throws IOException           if ffmpeg cannot be started or its output cannot be captured
 * @throws InterruptedException  if the calling thread is interrupted while waiting
 * @throws SpeechToTextException if ffmpeg times out or exits with a non-zero code
 */
private void convertOggToWav(Path oggFile, Path wavFile) throws IOException, InterruptedException {
    StringBuilder toolOutput = new StringBuilder();
    int exitCode;
    try {
        exitCode = runCommand(toolOutput, FFMPEG_TIMEOUT_SECONDS,
                "ffmpeg", "-y", "-i", oggFile.toString(), "-ar", "16000", "-ac", "1", wavFile.toString());
    } catch (java.util.concurrent.TimeoutException e) {
        throw new SpeechToTextException("ffmpeg conversion timed out", e);
    }

    if (exitCode != 0) {
        throw new SpeechToTextException("ffmpeg conversion failed: " + toolOutput);
    }
}

/**
 * Runs an external command to completion and returns its exit code.
 *
 * <p>Standard output and standard error are merged and redirected to a temporary
 * log file instead of a pipe. A pipe that nobody reads fills up and blocks the
 * child process, which would then look like a timeout; a file cannot block it.
 * The child is always killed if it is still alive when this method exits, which
 * covers both the timeout and an interruption of the waiting thread.
 *
 * @param capturedOutput receives the command's combined output, but only when the
 *                       exit code is non-zero
 * @param timeoutSeconds maximum time to wait for the command to finish
 * @param command        executable followed by its arguments
 * @return the exit code of the command
 * @throws IOException                            if the command cannot be started or its log cannot be read
 * @throws InterruptedException                   if the calling thread is interrupted while waiting
 * @throws java.util.concurrent.TimeoutException if the command does not finish in time
 */
private int runCommand(StringBuilder capturedOutput, long timeoutSeconds, String... command)
        throws IOException, InterruptedException, java.util.concurrent.TimeoutException {
    Path logFile = Files.createTempFile("whisper-process-", ".log");
    Process process = null;
    try {
        ProcessBuilder builder = new ProcessBuilder(command);
        builder.redirectErrorStream(true);
        builder.redirectOutput(logFile.toFile());

        process = builder.start();
        if (!process.waitFor(timeoutSeconds, TimeUnit.SECONDS)) {
            throw new java.util.concurrent.TimeoutException(
                    command[0] + " did not finish within " + timeoutSeconds + " seconds");
        }

        int exitCode = process.exitValue();
        if (exitCode != 0) {
            // Decode leniently with an explicit charset: tool output is diagnostic text only.
            capturedOutput.append(
                    new String(Files.readAllBytes(logFile), java.nio.charset.StandardCharsets.UTF_8));
        }
        return exitCode;
    } finally {
        if (process != null && process.isAlive()) {
            process.destroyForcibly();
        }
        deleteSilently(logFile);
    }
}

/**
 * Best-effort removal of a temporary file.
 *
 * <p>A failed deletion never propagates, so cleanup cannot mask the result or the
 * exception of the operation that created the file. It is logged instead, because
 * a file that stays behind may contain user audio.
 *
 * @param path file to delete; {@code null} is ignored
 */
private void deleteSilently(Path path) {
    if (path == null) {
        return;
    }
    try {
        Files.deleteIfExists(path);
    } catch (IOException e) {
        LOGGER.warn("Could not delete temporary file {}", path, e);
    }
}
}
11 changes: 11 additions & 0 deletions base/src/test/java/ai/javaclaw/speech/MockSpeechToTextService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package ai.javaclaw.speech;

import java.io.InputStream;

public class MockSpeechToTextService implements SpeechToTextService {
Comment thread
auloin marked this conversation as resolved.

/**
 * Returns a fixed placeholder instead of transcribing anything; the given
 * stream is never read.
 *
 * @param audioStream ignored
 * @return the constant text {@code "[voice message]"}
 */
@Override
public String transcribe(InputStream audioStream) {
return "[voice message]";
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
import ai.javaclaw.channels.Channel;
import ai.javaclaw.channels.ChannelMessageReceivedEvent;
import ai.javaclaw.channels.ChannelRegistry;
import ai.javaclaw.speech.SpeechToTextService;
import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
import org.commonmark.renderer.html.HtmlRenderer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.telegram.telegrambots.client.okhttp.OkHttpTelegramClient;
Expand All @@ -17,11 +21,11 @@
import org.telegram.telegrambots.meta.api.objects.message.Message;
import org.telegram.telegrambots.meta.exceptions.TelegramApiException;
import org.telegram.telegrambots.meta.generics.TelegramClient;
import org.commonmark.node.Node;
import org.commonmark.parser.Parser;
import org.commonmark.renderer.html.HtmlRenderer;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Optional;

import static java.util.Optional.ofNullable;

Expand All @@ -40,18 +44,26 @@ public class TelegramChannel implements Channel, SpringLongPollingBot, LongPolli
private final TelegramClient telegramClient;
private final Agent agent;
private final ChannelRegistry channelRegistry;
private final SpeechToTextService speechToTextService;
private final TelegramVoiceDownloader voiceDownloader;
private Long chatId;

public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry) {
this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry);
public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry, speechToTextService);
}

TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
this(botToken, allowedUsername, telegramClient, agent, channelRegistry, speechToTextService, new TelegramVoiceDownloader(telegramClient, botToken));
}

TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry) {
TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService, TelegramVoiceDownloader voiceDownloader) {
this.botToken = botToken;
this.allowedUsername = normalizeUsername(allowedUsername);
this.telegramClient = telegramClient;
this.agent = agent;
this.channelRegistry = channelRegistry;
this.speechToTextService = speechToTextService;
this.voiceDownloader = voiceDownloader;
channelRegistry.registerChannel(this);
LOGGER.info("Started Telegram integration");
}
Expand All @@ -68,7 +80,7 @@ public LongPollingUpdateConsumer getUpdatesConsumer() {

@Override
public void consume(Update update) {
if (!(update.hasMessage() && update.getMessage().hasText())) return;
if (!update.hasMessage()) return;

Message requestMessage = update.getMessage();
String userName = requestMessage.getFrom() == null ? null : requestMessage.getFrom().getUserName();
Expand All @@ -78,11 +90,13 @@ public void consume(Update update) {
return;
}

String messageText = requestMessage.getText();
Optional<String> messageText = resolveMessageText(requestMessage);
if (messageText.isEmpty()) return;

this.chatId = requestMessage.getChatId();
Integer messageThreadId = requestMessage.getMessageThreadId();
channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId));
String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText);
channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText.get(), chatId, messageThreadId));
String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText.get());
sendMessage(chatId, messageThreadId, response);
}

Expand Down Expand Up @@ -124,6 +138,28 @@ public void sendMessage(long chatId, Integer messageThreadId, String message) {
}
}

/**
 * Determines the text to process for an incoming message.
 *
 * <p>Text takes precedence; a message without text but with a voice attachment is
 * transcribed; anything else yields no text.
 *
 * @param message the incoming Telegram message
 * @return the message text or voice transcription, or empty when neither is available
 */
private Optional<String> resolveMessageText(Message message) {
    if (!message.hasText()) {
        return message.hasVoice() ? transcribeVoice(message) : Optional.empty();
    }
    return Optional.of(message.getText());
}

/**
 * Downloads the voice attachment of a message and transcribes it.
 *
 * <p>Every failure is logged and mapped to an empty result so that a broken voice
 * message never escapes into the update-consuming code. That includes the unchecked
 * {@code SpeechToTextException} raised by the speech service, which a catch limited
 * to checked exceptions would let through. A {@code null} or blank transcription is
 * treated as "no text" as well.
 *
 * @param message message that carries a voice attachment
 * @return the transcribed text, or empty when download or transcription failed
 *         or produced no text
 */
private Optional<String> transcribeVoice(Message message) {
    LOGGER.info("Voice message received, downloading audio");
    try (InputStream voiceStream = voiceDownloader.download(message)) {
        Optional<String> transcribed = Optional.ofNullable(speechToTextService.transcribe(voiceStream))
                .filter(text -> !text.isBlank());
        if (transcribed.isPresent()) {
            LOGGER.info("Voice message transcribed successfully");
        } else {
            LOGGER.warn("Voice message transcription produced no text");
        }
        return transcribed;
    } catch (IOException | TelegramApiException | ai.javaclaw.speech.SpeechToTextException e) {
        LOGGER.error("Failed to process voice message", e);
        return Optional.empty();
    }
}

private String convertMarkdownToTelegramHtml(String markdown) {
if (markdown == null || markdown.isBlank()) return "";

Expand Down Expand Up @@ -181,4 +217,4 @@ public Integer getMessageThreadId() {
return messageThreadId;
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import ai.javaclaw.agent.Agent;
import ai.javaclaw.channels.ChannelRegistry;
import ai.javaclaw.speech.SpeechToTextService;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.AutoConfiguration;
import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
Expand All @@ -19,7 +20,8 @@ public class TelegramChannelAutoConfiguration {
public TelegramChannel telegramChannel(@Value("${agent.channels.telegram.token:null}") String botToken,
@Value("${agent.channels.telegram.username:null}") String allowedUsername,
Agent agent,
ChannelRegistry channelRegistry) {
return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry);
ChannelRegistry channelRegistry,
SpeechToTextService speechToTextService) {
return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry, speechToTextService);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package ai.javaclaw.channels.telegram;

import org.telegram.telegrambots.meta.api.methods.GetFile;
import org.telegram.telegrambots.meta.api.objects.message.Message;
import org.telegram.telegrambots.meta.exceptions.TelegramApiException;
import org.telegram.telegrambots.meta.generics.TelegramClient;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;

/**
 * Downloads the audio of Telegram voice messages.
 *
 * <p>The file path is resolved through the {@code GetFile} API call and the content
 * is then fetched from Telegram's file endpoint, whose URL embeds the bot token.
 */
class TelegramVoiceDownloader {

    private static final String FILE_BASE_URL = "https://api.telegram.org/file/bot";

    /** Maximum time to establish the connection to the file endpoint. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;

    /** Maximum time a single read may block while downloading. */
    private static final int READ_TIMEOUT_MILLIS = 30_000;

    private final TelegramClient telegramClient;
    private final String botToken;

    /**
     * @param telegramClient client used to resolve a file id into a file path
     * @param botToken       bot token, required to build the download URL
     */
    TelegramVoiceDownloader(TelegramClient telegramClient, String botToken) {
        this.telegramClient = telegramClient;
        this.botToken = botToken;
    }

    /**
     * Opens a stream over the voice attachment of the given message.
     *
     * <p>Exceptions raised while opening the download are replaced by ones that do
     * not carry the request URL, because that URL contains the bot token and the
     * exception is likely to end up in logs.
     *
     * @param message message carrying a voice attachment
     * @return stream over the audio content; the caller must close it
     * @throws TelegramApiException if the file path cannot be resolved
     * @throws IOException          if the message has no voice attachment, Telegram
     *                              returns no usable file path, or the download cannot
     *                              be opened
     */
    InputStream download(Message message) throws TelegramApiException, IOException {
        if (message == null || message.getVoice() == null) {
            throw new IOException("Message does not contain a voice attachment");
        }

        String fileId = message.getVoice().getFileId();
        String filePath = telegramClient.execute(new GetFile(fileId)).getFilePath();
        if (filePath == null || filePath.isBlank()) {
            throw new IOException("Telegram returned no file path for voice file " + fileId);
        }

        java.net.URLConnection connection;
        try {
            connection = URI.create(FILE_BASE_URL + botToken + "/" + filePath).toURL().openConnection();
        } catch (IllegalArgumentException e) {
            // The original message would echo the full URL, token included.
            throw new IOException("Telegram returned an invalid file path for voice file " + fileId);
        }
        // Without explicit timeouts a stalled connection would block the caller indefinitely.
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);

        try {
            return connection.getInputStream();
        } catch (IOException e) {
            // The cause is deliberately dropped: JDK HTTP errors embed the request URL,
            // and with it the bot token, in their message.
            throw new IOException("Failed to download voice file " + filePath
                    + " from Telegram (" + e.getClass().getSimpleName() + ")");
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import ai.javaclaw.agent.Agent;
import ai.javaclaw.channels.ChannelRegistry;
import ai.javaclaw.speech.SpeechToTextService;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
Expand Down Expand Up @@ -32,6 +33,9 @@ class TelegramChannelTest {
@Mock
private Agent agent;

@Mock
private SpeechToTextService speechToTextService;

// -----------------------------------------------------------------------
// Ignored updates
// -----------------------------------------------------------------------
Expand All @@ -48,13 +52,17 @@ void ignoresUpdatesWithoutMessage() {
}

@Test
void ignoresUpdatesWithoutText() {
void ignoresUpdatesWithoutTextOrVoice() {
TelegramChannel channel = channel("allowed_user");
Update update = mock(Update.class);
Message message = mock(Message.class);
User user = mock(User.class);
when(update.hasMessage()).thenReturn(true);
when(update.getMessage()).thenReturn(message);
when(message.getFrom()).thenReturn(user);
when(user.getUserName()).thenReturn("allowed_user");
when(message.hasText()).thenReturn(false);
when(message.hasVoice()).thenReturn(false);

channel.consume(update);

Expand All @@ -68,7 +76,6 @@ void ignoresMessagesFromNullUsername() {
Message message = mock(Message.class);
when(update.hasMessage()).thenReturn(true);
when(update.getMessage()).thenReturn(message);
when(message.hasText()).thenReturn(true);
when(message.getFrom()).thenReturn(null);

channel.consume(update);
Expand Down Expand Up @@ -234,7 +241,7 @@ void sendMessageFallbacksToSendingRawTextWhenFailingToSendHtml() throws Telegram
// -----------------------------------------------------------------------

private TelegramChannel channel(String allowedUsername) {
return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry());
return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry(), speechToTextService);
}

private Update updateFromUnknownUser(String username) {
Expand All @@ -243,7 +250,6 @@ private Update updateFromUnknownUser(String username) {
User user = mock(User.class);
when(update.hasMessage()).thenReturn(true);
when(update.getMessage()).thenReturn(message);
when(message.hasText()).thenReturn(true);
when(message.getFrom()).thenReturn(user);
when(user.getUserName()).thenReturn(username);
return update;
Expand Down
Loading