jobrunr · soufianebouaddis · Mar 26, 2026 · Apr 22, 2026 · Apr 27, 2026 · Apr 27, 2026
diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextException.java
@@ -0,0 +1,12 @@
+package ai.javaclaw.speech;
+
+public class SpeechToTextException extends RuntimeException {
+
+    public SpeechToTextException(String message) {
+        super(message);
+    }
+
+    public SpeechToTextException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java
@@ -0,0 +1,8 @@
+package ai.javaclaw.speech;
+
+import java.io.InputStream;
+
+public interface SpeechToTextService {
+
+    String transcribe(InputStream audioStream);
+}
diff --git a/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/WhisperCppSpeechToTextService.java
@@ -0,0 +1,121 @@
+package ai.javaclaw.speech;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
+import org.springframework.stereotype.Service;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.concurrent.TimeUnit;
+
+@Service
+@ConditionalOnProperty(name = "speech.provider", havingValue = "whisper-cpp")
+public class WhisperCppSpeechToTextService implements SpeechToTextService {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(WhisperCppSpeechToTextService.class);
+
+    private final String modelPath;
+
+    public WhisperCppSpeechToTextService(
+            @Value("${speech.whisper-cpp.model-path}") String modelPath) {
+        this.modelPath = modelPath;
+    }
+
+    @Override
+    public String transcribe(InputStream audioStream) {
+        LOGGER.info("Transcribing audio via whisper-cpp (model: {})", modelPath);
+
+        Path oggFile = null;
+        Path wavFile = null;
+        Path outputFile = null;
+
+        try {
+            oggFile = Files.createTempFile("whisper-input-", ".ogg");
+            Files.write(oggFile, audioStream.readAllBytes());
+
+            wavFile = Files.createTempFile("whisper-input-", ".wav");
+            convertOggToWav(oggFile, wavFile);
+
+            outputFile = Files.createTempFile("whisper-output-", ".txt");
+            Files.deleteIfExists(outputFile);
+
+            ProcessBuilder pb = new ProcessBuilder(
+                    "whisper-cli",
+                    "-m", modelPath,
+                    "-f", wavFile.toString(),
+                    "-otxt",
+                    "-of", outputFile.toString().replace(".txt", ""),
+                    "--no-prints"
+            );
+            pb.redirectErrorStream(true);
+
+            Process process = pb.start();
+            boolean finished = process.waitFor(60, TimeUnit.SECONDS);
+
+            if (!finished) {
+                process.destroyForcibly();
+                throw new SpeechToTextException("whisper-cli timed out after 60 seconds");
+            }
+
+            if (process.exitValue() != 0) {
+                String error = new String(process.getInputStream().readAllBytes());
+                throw new SpeechToTextException("whisper-cli exited with code " + process.exitValue() + ": " + error);
+            }
+
+            if (!Files.exists(outputFile)) {
+                throw new SpeechToTextException("whisper-cli did not produce output file");
+            }
+
+            String text = Files.readString(outputFile).trim();
+            if (text.isBlank()) {
+                throw new SpeechToTextException("whisper-cli returned empty transcription");
+            }
+
+            LOGGER.info("whisper-cpp transcription completed successfully");
+            return text;
+
+        } catch (IOException | InterruptedException e) {
+            if (e instanceof InterruptedException) {
+                Thread.currentThread().interrupt();
+            }
+            throw new SpeechToTextException("Failed to run whisper-cli", e);
+        } finally {
+            deleteSilently(oggFile);
+            deleteSilently(wavFile);
+            deleteSilently(outputFile);
+        }
+    }
+
+    private void convertOggToWav(Path oggFile, Path wavFile) throws IOException, InterruptedException {
+        ProcessBuilder pb = new ProcessBuilder(
+                "ffmpeg", "-y", "-i", oggFile.toString(), "-ar", "16000", "-ac", "1", wavFile.toString()
+        );
+        pb.redirectErrorStream(true);
+
+        Process process = pb.start();
+        boolean finished = process.waitFor(30, TimeUnit.SECONDS);
+
+        if (!finished) {
+            process.destroyForcibly();
+            throw new SpeechToTextException("ffmpeg conversion timed out");
+        }
+
+        if (process.exitValue() != 0) {
+            String error = new String(process.getInputStream().readAllBytes());
+            throw new SpeechToTextException("ffmpeg conversion failed: " + error);
+        }
+    }
+
+    private void deleteSilently(Path path) {
+        if (path != null) {
+            try {
+                Files.deleteIfExists(path);
+            } catch (IOException ignored) {
+            }
+        }
+    }
+}
diff --git a/base/src/test/java/ai/javaclaw/speech/MockSpeechToTextService.java b/base/src/test/java/ai/javaclaw/speech/MockSpeechToTextService.java
@@ -0,0 +1,11 @@
+package ai.javaclaw.speech;
+
+import java.io.InputStream;
+
+public class MockSpeechToTextService implements SpeechToTextService {
+
+    @Override
+    public String transcribe(InputStream audioStream) {
+        return "[voice message]";
+    }
+}
diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java
@@ -4,7 +4,11 @@
 import ai.javaclaw.channels.Channel;
 import ai.javaclaw.channels.ChannelMessageReceivedEvent;
 import ai.javaclaw.channels.ChannelRegistry;
+import ai.javaclaw.speech.SpeechToTextService;
 import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension;
+import org.commonmark.node.Node;
+import org.commonmark.parser.Parser;
+import org.commonmark.renderer.html.HtmlRenderer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.telegram.telegrambots.client.okhttp.OkHttpTelegramClient;
@@ -17,11 +21,11 @@
 import org.telegram.telegrambots.meta.api.objects.message.Message;
 import org.telegram.telegrambots.meta.exceptions.TelegramApiException;
 import org.telegram.telegrambots.meta.generics.TelegramClient;
-import org.commonmark.node.Node;
-import org.commonmark.parser.Parser;
-import org.commonmark.renderer.html.HtmlRenderer;
 
+import java.io.IOException;
+import java.io.InputStream;
 import java.util.List;
+import java.util.Optional;
 
 import static java.util.Optional.ofNullable;
 
@@ -40,18 +44,26 @@ public class TelegramChannel implements Channel, SpringLongPollingBot, LongPolli
     private final TelegramClient telegramClient;
     private final Agent agent;
     private final ChannelRegistry channelRegistry;
+    private final SpeechToTextService speechToTextService;
+    private final TelegramVoiceDownloader voiceDownloader;
     private Long chatId;
 
-    public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry) {
-        this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry);
+    public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
+        this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry, speechToTextService);
+    }
+
+    TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) {
+        this(botToken, allowedUsername, telegramClient, agent, channelRegistry, speechToTextService, new TelegramVoiceDownloader(telegramClient, botToken));
     }
 
-    TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry) {
+    TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService, TelegramVoiceDownloader voiceDownloader) {
         this.botToken = botToken;
         this.allowedUsername = normalizeUsername(allowedUsername);
         this.telegramClient = telegramClient;
         this.agent = agent;
         this.channelRegistry = channelRegistry;
+        this.speechToTextService = speechToTextService;
+        this.voiceDownloader = voiceDownloader;
         channelRegistry.registerChannel(this);
         LOGGER.info("Started Telegram integration");
     }
@@ -68,7 +80,7 @@ public LongPollingUpdateConsumer getUpdatesConsumer() {
 
     @Override
     public void consume(Update update) {
-        if (!(update.hasMessage() && update.getMessage().hasText())) return;
+        if (!update.hasMessage()) return;
 
         Message requestMessage = update.getMessage();
         String userName = requestMessage.getFrom() == null ? null : requestMessage.getFrom().getUserName();
@@ -78,11 +90,13 @@ public void consume(Update update) {
             return;
         }
 
-        String messageText = requestMessage.getText();
+        Optional<String> messageText = resolveMessageText(requestMessage);
+        if (messageText.isEmpty()) return;
+
         this.chatId = requestMessage.getChatId();
         Integer messageThreadId = requestMessage.getMessageThreadId();
-        channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId));
-        String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText);
+        channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText.get(), chatId, messageThreadId));
+        String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText.get());
         sendMessage(chatId, messageThreadId, response);
     }
 
@@ -124,6 +138,28 @@ public void sendMessage(long chatId, Integer messageThreadId, String message) {
         }
     }
 
+    private Optional<String> resolveMessageText(Message message) {
+        if (message.hasText()) {
+            return Optional.of(message.getText());
+        }
+        if (message.hasVoice()) {
+            return transcribeVoice(message);
+        }
+        return Optional.empty();
+    }
+
+    private Optional<String> transcribeVoice(Message message) {
+        LOGGER.info("Voice message received, downloading audio");
+        try (InputStream voiceStream = voiceDownloader.download(message)) {
+            String transcribed = speechToTextService.transcribe(voiceStream);
+            LOGGER.info("Voice message transcribed successfully");
+            return Optional.of(transcribed);
+        } catch (IOException | TelegramApiException e) {
+            LOGGER.error("Failed to process voice message", e);
+            return Optional.empty();
+        }
+    }
+
     private String convertMarkdownToTelegramHtml(String markdown) {
         if (markdown == null || markdown.isBlank()) return "";
 
@@ -181,4 +217,4 @@ public Integer getMessageThreadId() {
             return messageThreadId;
         }
     }
-}
+}
diff --git a/...elegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java b/...elegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java
@@ -3,6 +3,7 @@
 
 import ai.javaclaw.agent.Agent;
 import ai.javaclaw.channels.ChannelRegistry;
+import ai.javaclaw.speech.SpeechToTextService;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.boot.autoconfigure.AutoConfiguration;
 import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean;
@@ -19,7 +20,8 @@ public class TelegramChannelAutoConfiguration {
     public TelegramChannel telegramChannel(@Value("${agent.channels.telegram.token:null}") String botToken,
                                            @Value("${agent.channels.telegram.username:null}") String allowedUsername,
                                            Agent agent,
-                                           ChannelRegistry channelRegistry) {
-        return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry);
+                                           ChannelRegistry channelRegistry,
+                                           SpeechToTextService speechToTextService) {
+        return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry, speechToTextService);
     }
 }
diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramVoiceDownloader.java
@@ -0,0 +1,29 @@
+package ai.javaclaw.channels.telegram;
+
+import org.telegram.telegrambots.meta.api.methods.GetFile;
+import org.telegram.telegrambots.meta.api.objects.message.Message;
+import org.telegram.telegrambots.meta.exceptions.TelegramApiException;
+import org.telegram.telegrambots.meta.generics.TelegramClient;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+
+class TelegramVoiceDownloader {
+
+    private final TelegramClient telegramClient;
+    private final String botToken;
+
+    TelegramVoiceDownloader(TelegramClient telegramClient, String botToken) {
+        this.telegramClient = telegramClient;
+        this.botToken = botToken;
+    }
+
+    InputStream download(Message message) throws TelegramApiException, IOException {
+        String fileId = message.getVoice().getFileId();
+        GetFile getFile = new GetFile(fileId);
+        String filePath = telegramClient.execute(getFile).getFilePath();
+        String fileUrl = "https://api.telegram.org/file/bot" + botToken + "/" + filePath;
+        return URI.create(fileUrl).toURL().openStream();
+    }
+}
diff --git a/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java b/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java
@@ -2,6 +2,7 @@
 
 import ai.javaclaw.agent.Agent;
 import ai.javaclaw.channels.ChannelRegistry;
+import ai.javaclaw.speech.SpeechToTextService;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
@@ -32,6 +33,9 @@ class TelegramChannelTest {
     @Mock
     private Agent agent;
 
+    @Mock
+    private SpeechToTextService speechToTextService;
+
     // -----------------------------------------------------------------------
     // Ignored updates
     // -----------------------------------------------------------------------
@@ -48,13 +52,17 @@ void ignoresUpdatesWithoutMessage() {
     }
 
     @Test
-    void ignoresUpdatesWithoutText() {
+    void ignoresUpdatesWithoutTextOrVoice() {
         TelegramChannel channel = channel("allowed_user");
         Update update = mock(Update.class);
         Message message = mock(Message.class);
+        User user = mock(User.class);
         when(update.hasMessage()).thenReturn(true);
         when(update.getMessage()).thenReturn(message);
+        when(message.getFrom()).thenReturn(user);
+        when(user.getUserName()).thenReturn("allowed_user");
         when(message.hasText()).thenReturn(false);
+        when(message.hasVoice()).thenReturn(false);
 
         channel.consume(update);
 
@@ -68,7 +76,6 @@ void ignoresMessagesFromNullUsername() {
         Message message = mock(Message.class);
         when(update.hasMessage()).thenReturn(true);
         when(update.getMessage()).thenReturn(message);
-        when(message.hasText()).thenReturn(true);
         when(message.getFrom()).thenReturn(null);
 
         channel.consume(update);
@@ -234,7 +241,7 @@ void sendMessageFallbacksToSendingRawTextWhenFailingToSendHtml() throws Telegram
     // -----------------------------------------------------------------------
 
     private TelegramChannel channel(String allowedUsername) {
-        return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry());
+        return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry(), speechToTextService);
     }
 
     private Update updateFromUnknownUser(String username) {
@@ -243,7 +250,6 @@ private Update updateFromUnknownUser(String username) {
         User user = mock(User.class);
         when(update.hasMessage()).thenReturn(true);
         when(update.getMessage()).thenReturn(message);
-        when(message.hasText()).thenReturn(true);
         when(message.getFrom()).thenReturn(user);
         when(user.getUserName()).thenReturn(username);
         return update;