diff --git a/AGENTS.md b/AGENTS.md
index 6946d441..8f45cac0 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -5,7 +5,7 @@
 
 ## Overview
 
-macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). Clicking the menu bar icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to Claude. Claude responds with text (streamed via SSE) and voice (ElevenLabs TTS). A blue cursor overlay can fly to and point at UI elements Claude references on any connected monitor.
+macOS menu bar companion app. Lives entirely in the macOS status bar (no dock icon, no main window). Clicking the menu bar icon opens a custom floating panel with companion voice controls. Uses push-to-talk (ctrl+option) to capture voice input, transcribes it via AssemblyAI streaming, and sends the transcript + a screenshot of the user's screen to the selected chat model through a Cloudflare Worker proxy. Chat can be routed to Claude, OpenAI, or Gemini; the response is streamed back as text (via SSE) and spoken through ElevenLabs TTS. A blue cursor overlay can fly to and point at UI elements the model references on any connected monitor.
 
 All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in the app.
 
@@ -14,7 +14,7 @@ All API keys live on a Cloudflare Worker proxy — nothing sensitive ships in th
 - **App Type**: Menu bar-only (`LSUIElement=true`), no dock icon or main window
 - **Framework**: SwiftUI (macOS native) with AppKit bridging for menu bar panel and cursor overlay
 - **Pattern**: MVVM with `@StateObject` / `@Published` state management
-- **AI Chat**: Claude (Sonnet 4.6 default, Opus 4.6 optional) via Cloudflare Worker proxy with SSE streaming
+- **AI Chat**: Multi-provider chat via Cloudflare Worker proxy. Claude (`claude-sonnet-4-6` default, `claude-opus-4-6`), OpenAI (`gpt-5.4`), and Gemini (`gemini-2.5-flash`) all route through one normalized SSE chat endpoint
 - **Speech-to-Text**: AssemblyAI real-time streaming (`u3-rt-pro` model) via websocket, with OpenAI and Apple Speech as fallbacks
 - **Text-to-Speech**: ElevenLabs (`eleven_flash_v2_5` model) via Cloudflare Worker proxy
 - **Screen Capture**: ScreenCaptureKit (macOS 14.2+), multi-monitor support
@@ -29,11 +29,11 @@ The app never calls external APIs directly. All requests go through a Cloudflare
 
 | Route | Upstream | Purpose |
 |-------|----------|---------|
-| `POST /chat` | `api.anthropic.com/v1/messages` | Claude vision + streaming chat |
+| `POST /chat` | `api.anthropic.com/v1/messages`, `api.openai.com/v1/responses`, `generativelanguage.googleapis.com/v1beta/models/*:generateContent` | Multi-provider vision chat normalized into a single SSE response shape |
 | `POST /tts` | `api.elevenlabs.io/v1/text-to-speech/{voiceId}` | ElevenLabs TTS audio |
 | `POST /transcribe-token` | `streaming.assemblyai.com/v3/token` | Fetches a short-lived (480s) AssemblyAI websocket token |
 
-Worker secrets: `ANTHROPIC_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY`
+Worker secrets: `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GEMINI_API_KEY`, `ASSEMBLYAI_API_KEY`, `ELEVENLABS_API_KEY`
 Worker vars: `ELEVENLABS_VOICE_ID`
 
 ### Key Architecture Decisions
@@ -53,9 +53,9 @@ Worker vars: `ELEVENLABS_VOICE_ID`
 | File | Lines | Purpose |
 |------|-------|---------|
 | `leanring_buddyApp.swift` | ~89 | Menu bar app entry point. Uses `@NSApplicationDelegateAdaptor` with `CompanionAppDelegate` which creates `MenuBarPanelManager` and starts `CompanionManager`. No main window — the app lives entirely in the status bar. |
-| `CompanionManager.swift` | ~1026 | Central state machine. Owns dictation, shortcut monitoring, screen capture, Claude API, ElevenLabs TTS, and overlay management. Tracks voice state (idle/listening/processing/responding), conversation history, model selection, and cursor visibility. Coordinates the full push-to-talk → screenshot → Claude → TTS → pointing pipeline. |
+| `CompanionManager.swift` | ~1040 | Central state machine. Owns dictation, shortcut monitoring, screen capture, provider-agnostic chat API, ElevenLabs TTS, and overlay management. Tracks voice state (idle/listening/processing/responding), conversation history, model selection, and cursor visibility. Coordinates the full push-to-talk → screenshot → model response → TTS → pointing pipeline. |
 | `MenuBarPanelManager.swift` | ~243 | NSStatusItem + custom NSPanel lifecycle. Creates the menu bar icon, manages the floating companion panel (show/hide/position), installs click-outside-to-dismiss monitor. |
-| `CompanionPanelView.swift` | ~761 | SwiftUI panel content for the menu bar dropdown. Shows companion status, push-to-talk instructions, model picker (Sonnet/Opus), permissions UI, DM feedback button, and quit button. Dark aesthetic using `DS` design system. |
+| `CompanionPanelView.swift` | ~761 | SwiftUI panel content for the menu bar dropdown. Shows companion status, push-to-talk instructions, model picker (Claude, OpenAI, Gemini options), permissions UI, DM feedback button, and quit button. Dark aesthetic using `DS` design system. |
 | `OverlayWindow.swift` | ~881 | Full-screen transparent overlay hosting the blue cursor, response text, waveform, and spinner. Handles cursor animation, element pointing with bezier arcs, multi-monitor coordinate mapping, and fade-out transitions. |
 | `CompanionResponseOverlay.swift` | ~217 | SwiftUI view for the response text bubble and waveform displayed next to the cursor in the overlay. |
 | `CompanionScreenCaptureUtility.swift` | ~132 | Multi-monitor screenshot capture using ScreenCaptureKit. Returns labeled image data for each connected display. |
@@ -66,7 +66,7 @@ Worker vars: `ELEVENLABS_VOICE_ID`
 | `AppleSpeechTranscriptionProvider.swift` | ~147 | Local fallback transcription provider backed by Apple's Speech framework. |
 | `BuddyAudioConversionSupport.swift` | ~108 | Audio conversion helpers. Converts live mic buffers to PCM16 mono audio and builds WAV payloads for upload-based providers. |
 | `GlobalPushToTalkShortcutMonitor.swift` | ~132 | System-wide push-to-talk monitor. Owns the listen-only `CGEvent` tap and publishes press/release transitions. |
-| `ClaudeAPI.swift` | ~291 | Claude vision API client with streaming (SSE) and non-streaming modes. TLS warmup optimization, image MIME detection, conversation history support. |
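+| `ClaudeAPI.swift` | ~170 | Defines `CompanionChatAPI` (also exposed as `ClaudeAPI` via a typealias): a provider-agnostic chat client that sends normalized multimodal requests to the Worker and parses a normalized SSE response stream for Claude, OpenAI, and Gemini. Retains the TLS warmup optimization and image MIME detection. |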
 | `OpenAIAPI.swift` | ~142 | OpenAI GPT vision API client. |
 | `ElevenLabsTTSClient.swift` | ~81 | ElevenLabs TTS client. Sends text to the Worker proxy, plays back audio via `AVAudioPlayer`. Exposes `isPlaying` for transient cursor scheduling. |
 | `ElementLocationDetector.swift` | ~335 | Detects UI element locations in screenshots for cursor pointing. |
@@ -74,7 +74,7 @@ Worker vars: `ELEVENLABS_VOICE_ID`
 | `ClickyAnalytics.swift` | ~121 | PostHog analytics integration for usage tracking. |
 | `WindowPositionManager.swift` | ~262 | Window placement logic, Screen Recording permission flow, and accessibility permission helpers. |
 | `AppBundleConfiguration.swift` | ~28 | Runtime configuration reader for keys stored in the app bundle Info.plist. |
-| `worker/src/index.ts` | ~142 | Cloudflare Worker proxy. Three routes: `/chat` (Claude), `/tts` (ElevenLabs), `/transcribe-token` (AssemblyAI temp token). |
+| `worker/src/index.ts` | ~360 | Cloudflare Worker proxy. Three routes: `/chat` (Claude/OpenAI/Gemini multimodal chat normalized to a shared SSE format), `/tts` (ElevenLabs), `/transcribe-token` (AssemblyAI temp token). |
 
 ## Build & Run
 
@@ -98,6 +98,8 @@ npm install
 
 # Add secrets
 npx wrangler secret put ANTHROPIC_API_KEY
+npx wrangler secret put OPENAI_API_KEY
+npx wrangler secret put GEMINI_API_KEY
 npx wrangler secret put ASSEMBLYAI_API_KEY
 npx wrangler secret put ELEVENLABS_API_KEY
 
diff --git a/README.md b/README.md
index d7dbf74b..22686651 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ If you want to do it yourself, here's the deal.
 - Xcode 15+
 - Node.js 18+ (for the Cloudflare Worker)
 - A [Cloudflare](https://cloudflare.com) account (free tier works)
-- API keys for: [Anthropic](https://console.anthropic.com), [AssemblyAI](https://www.assemblyai.com), [ElevenLabs](https://elevenlabs.io)
+- API keys for: [Anthropic](https://console.anthropic.com), [OpenAI](https://platform.openai.com), [Google AI Studio / Gemini](https://aistudio.google.com), [AssemblyAI](https://www.assemblyai.com), [ElevenLabs](https://elevenlabs.io)
 
 ### 1. Set up the Cloudflare Worker
 
@@ -52,6 +52,8 @@ Now add your secrets. Wrangler will prompt you to paste each one:
 
 ```bash
 npx wrangler secret put ANTHROPIC_API_KEY
+npx wrangler secret put OPENAI_API_KEY
+npx wrangler secret put GEMINI_API_KEY
 npx wrangler secret put ASSEMBLYAI_API_KEY
 npx wrangler secret put ELEVENLABS_API_KEY
 ```
@@ -84,6 +86,8 @@ This starts a local server (usually `http://localhost:8787`) that behaves exactl
 
 ```
 ANTHROPIC_API_KEY=sk-ant-...
+OPENAI_API_KEY=sk-...
+GEMINI_API_KEY=...
 ASSEMBLYAI_API_KEY=...
 ELEVENLABS_API_KEY=...
 ELEVENLABS_VOICE_ID=...
diff --git a/leanring-buddy/ClaudeAPI.swift b/leanring-buddy/ClaudeAPI.swift
index 0c7070b5..9cbb2a44 100644
--- a/leanring-buddy/ClaudeAPI.swift
+++ b/leanring-buddy/ClaudeAPI.swift
@@ -1,55 +1,43 @@
 //
 //  ClaudeAPI.swift
-//  Claude API Implementation with streaming support
+//  Provider-agnostic companion chat API with normalized SSE parsing
 //
 
 import Foundation
 
-/// Claude API helper with streaming for progressive text display.
-class ClaudeAPI {
+final class CompanionChatAPI {
     private static let tlsWarmupLock = NSLock()
     private static var hasStartedTLSWarmup = false
 
-    private let apiURL: URL
+    private let proxyURL: URL
     var model: String
     private let session: URLSession
 
     init(proxyURL: String, model: String = "claude-sonnet-4-6") {
-        self.apiURL = URL(string: proxyURL)!
+        self.proxyURL = URL(string: proxyURL)!
         self.model = model
 
-        // Use .default instead of .ephemeral so TLS session tickets are cached.
-        // Ephemeral sessions do a full TLS handshake on every request, which causes
-        // transient -1200 (errSSLPeerHandshakeFail) errors with large image payloads.
-        // Disable URL/cookie caching to avoid storing responses or credentials on disk.
-        let config = URLSessionConfiguration.default
-        config.timeoutIntervalForRequest = 120
-        config.timeoutIntervalForResource = 300
-        config.waitsForConnectivity = true
-        config.urlCache = nil
-        config.httpCookieStorage = nil
-        self.session = URLSession(configuration: config)
+        let configuration = URLSessionConfiguration.default
+        configuration.timeoutIntervalForRequest = 120
+        configuration.timeoutIntervalForResource = 300
+        configuration.waitsForConnectivity = true
+        configuration.urlCache = nil
+        configuration.httpCookieStorage = nil
+        self.session = URLSession(configuration: configuration)
 
-        // Fire a lightweight HEAD request in the background to pre-establish the TLS
-        // connection. This caches the TLS session ticket so the first real API call
-        // (which carries a large image payload) doesn't need a cold TLS handshake.
         warmUpTLSConnectionIfNeeded()
     }
 
-    private func makeAPIRequest() -> URLRequest {
-        var request = URLRequest(url: apiURL)
+    private func makeRequest() -> URLRequest {
+        var request = URLRequest(url: proxyURL)
         request.httpMethod = "POST"
         request.timeoutInterval = 120
         request.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        request.setValue("text/event-stream", forHTTPHeaderField: "Accept")
         return request
     }
 
-    /// Detects the MIME type of image data by inspecting the first bytes.
-    /// Screen captures from ScreenCaptureKit are JPEG, but pasted images from the
-    /// clipboard are PNG. The API rejects requests where the declared media_type
-    /// doesn't match the actual image format.
     private func detectImageMediaType(for imageData: Data) -> String {
-        // PNG files start with the 8-byte signature: 89 50 4E 47 0D 0A 1A 0A
         if imageData.count >= 4 {
             let pngSignature: [UInt8] = [0x89, 0x50, 0x4E, 0x47]
             let firstFourBytes = [UInt8](imageData.prefix(4))
@@ -57,12 +45,10 @@ class ClaudeAPI {
                 return "image/png"
             }
         }
-        // Default to JPEG — screen captures use JPEG compression
+
         return "image/jpeg"
     }
 
-    /// Sends a no-op HEAD request to the API host to establish and cache a TLS session.
-    /// Failures are silently ignored — this is purely an optimization.
     private func warmUpTLSConnectionIfNeeded() {
         Self.tlsWarmupLock.lock()
         let shouldStartTLSWarmup = !Self.hasStartedTLSWarmup
@@ -73,12 +59,10 @@ class ClaudeAPI {
 
         guard shouldStartTLSWarmup else { return }
 
-        guard var warmupURLComponents = URLComponents(url: apiURL, resolvingAgainstBaseURL: false) else {
+        guard var warmupURLComponents = URLComponents(url: proxyURL, resolvingAgainstBaseURL: false) else {
             return
         }
 
-        // The TLS session ticket is host-scoped, so warming the root host is enough.
-        // Hitting the host instead of `/v1/messages` avoids extra endpoint-specific noise.
         warmupURLComponents.path = "/"
         warmupURLComponents.query = nil
         warmupURLComponents.fragment = nil
@@ -91,13 +75,10 @@ class ClaudeAPI {
         warmupRequest.httpMethod = "HEAD"
         warmupRequest.timeoutInterval = 10
         session.dataTask(with: warmupRequest) { _, _, _ in
-            // Response doesn't matter — the TLS handshake is the goal
+            // Establishing a warm TLS session is enough here.
         }.resume()
     }
 
-    /// Send a vision request to Claude with streaming.
-    /// Calls `onTextChunk` on the main actor each time new text arrives so the UI updates progressively.
-    /// Returns the full accumulated text and total duration when the stream completes.
     func analyzeImageStreaming(
         images: [(data: Data, label: String)],
         systemPrompt: String,
@@ -106,86 +87,64 @@ class ClaudeAPI {
         onTextChunk: @MainActor @Sendable (String) -> Void
     ) async throws -> (text: String, duration: TimeInterval) {
         let startTime = Date()
+        var request = makeRequest()
 
-        var request = makeAPIRequest()
-
-        // Build messages array
-        var messages: [[String: Any]] = []
-
-        for (userPlaceholder, assistantResponse) in conversationHistory {
-            messages.append(["role": "user", "content": userPlaceholder])
-            messages.append(["role": "assistant", "content": assistantResponse])
-        }
-
-        // Build current message with all labeled images + prompt
-        var contentBlocks: [[String: Any]] = []
-        for image in images {
-            contentBlocks.append([
-                "type": "image",
-                "source": [
-                    "type": "base64",
+        let requestBody: [String: Any] = [
+            "model": model,
+            "system_prompt": systemPrompt,
+            "conversation_history": conversationHistory.map { entry in
+                [
+                    "user_transcript": entry.userPlaceholder,
+                    "assistant_response": entry.assistantResponse
+                ]
+            },
+            "images": images.map { image in
+                [
                     "media_type": detectImageMediaType(for: image.data),
-                    "data": image.data.base64EncodedString()
+                    "data": image.data.base64EncodedString(),
+                    "label": image.label
                 ]
-            ])
-            contentBlocks.append([
-                "type": "text",
-                "text": image.label
-            ])
-        }
-        contentBlocks.append([
-            "type": "text",
-            "text": userPrompt
-        ])
-        messages.append(["role": "user", "content": contentBlocks])
-
-        let body: [String: Any] = [
-            "model": model,
-            "max_tokens": 1024,
-            "stream": true,
-            "system": systemPrompt,
-            "messages": messages
+            },
+            "user_prompt": userPrompt
         ]
 
-        let bodyData = try JSONSerialization.data(withJSONObject: body)
+        let bodyData = try JSONSerialization.data(withJSONObject: requestBody)
         request.httpBody = bodyData
+
         let payloadMB = Double(bodyData.count) / 1_048_576.0
-        print("🌐 Claude streaming request: \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)")
+        print("🌐 Companion chat request: model \(model), \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)")
 
-        // Use bytes streaming for SSE (Server-Sent Events)
         let (byteStream, response) = try await session.bytes(for: request)
 
         guard let httpResponse = response as? HTTPURLResponse else {
             throw NSError(
-                domain: "ClaudeAPI",
+                domain: "CompanionChatAPI",
                 code: -1,
                 userInfo: [NSLocalizedDescriptionKey: "Invalid HTTP response"]
             )
         }
 
-        // If non-2xx status, read the full body as error text
         guard (200...299).contains(httpResponse.statusCode) else {
             var errorBodyChunks: [String] = []
             for try await line in byteStream.lines {
                 errorBodyChunks.append(line)
             }
-            let errorBody = errorBodyChunks.joined(separator: "\n")
+
             throw NSError(
-                domain: "ClaudeAPI",
+                domain: "CompanionChatAPI",
                 code: httpResponse.statusCode,
-                userInfo: [NSLocalizedDescriptionKey: "API Error (\(httpResponse.statusCode)): \(errorBody)"]
+                userInfo: [
+                    NSLocalizedDescriptionKey: "API Error (\(httpResponse.statusCode)): \(errorBodyChunks.joined(separator: "\n"))"
+                ]
             )
         }
 
-        // Parse SSE stream — each event is "data: {json}\n\n"
         var accumulatedResponseText = ""
 
         for try await line in byteStream.lines {
-            // SSE lines look like: "data: {...}"
             guard line.hasPrefix("data: ") else { continue }
-            let jsonString = String(line.dropFirst(6)) // Drop "data: " prefix
+            let jsonString = String(line.dropFirst(6))
 
-            // End of stream marker
             guard jsonString != "[DONE]" else { break }
 
             guard let jsonData = jsonString.data(using: .utf8),
@@ -194,98 +153,19 @@ class ClaudeAPI {
                 continue
             }
 
-            // We care about content_block_delta events that contain text chunks
             if eventType == "content_block_delta",
                let delta = eventPayload["delta"] as? [String: Any],
                let deltaType = delta["type"] as? String,
                deltaType == "text_delta",
                let textChunk = delta["text"] as? String {
                 accumulatedResponseText += textChunk
-                // Send the accumulated text so far to the UI for progressive rendering
-                let currentAccumulatedText = accumulatedResponseText
-                await onTextChunk(currentAccumulatedText)
+                await onTextChunk(accumulatedResponseText)
             }
         }
 
         let duration = Date().timeIntervalSince(startTime)
         return (text: accumulatedResponseText, duration: duration)
     }
-
-    /// Non-streaming fallback for validation requests where we don't need progressive display.
-    func analyzeImage(
-        images: [(data: Data, label: String)],
-        systemPrompt: String,
-        conversationHistory: [(userPlaceholder: String, assistantResponse: String)] = [],
-        userPrompt: String
-    ) async throws -> (text: String, duration: TimeInterval) {
-        let startTime = Date()
-
-        var request = makeAPIRequest()
-
-        var messages: [[String: Any]] = []
-        for (userPlaceholder, assistantResponse) in conversationHistory {
-            messages.append(["role": "user", "content": userPlaceholder])
-            messages.append(["role": "assistant", "content": assistantResponse])
-        }
-
-        // Build current message with all labeled images + prompt
-        var contentBlocks: [[String: Any]] = []
-        for image in images {
-            contentBlocks.append([
-                "type": "image",
-                "source": [
-                    "type": "base64",
-                    "media_type": detectImageMediaType(for: image.data),
-                    "data": image.data.base64EncodedString()
-                ]
-            ])
-            contentBlocks.append([
-                "type": "text",
-                "text": image.label
-            ])
-        }
-        contentBlocks.append([
-            "type": "text",
-            "text": userPrompt
-        ])
-        messages.append(["role": "user", "content": contentBlocks])
-
-        let body: [String: Any] = [
-            "model": model,
-            "max_tokens": 256,
-            "system": systemPrompt,
-            "messages": messages
-        ]
-
-        let bodyData = try JSONSerialization.data(withJSONObject: body)
-        request.httpBody = bodyData
-        let payloadMB = Double(bodyData.count) / 1_048_576.0
-        print("🌐 Claude request: \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)")
-
-        let (data, response) = try await session.data(for: request)
-
-        guard let httpResponse = response as? HTTPURLResponse,
-              (200...299).contains(httpResponse.statusCode) else {
-            let responseString = String(data: data, encoding: .utf8) ?? "Unknown error"
-            throw NSError(
-                domain: "ClaudeAPI",
-                code: (response as? HTTPURLResponse)?.statusCode ?? -1,
-                userInfo: [NSLocalizedDescriptionKey: "API Error: \(responseString)"]
-            )
-        }
-
-        let json = try JSONSerialization.jsonObject(with: data) as? [String: Any]
-        guard let content = json?["content"] as? [[String: Any]],
-              let textBlock = content.first(where: { ($0["type"] as? String) == "text" }),
-              let text = textBlock["text"] as? String else {
-            throw NSError(
-                domain: "ClaudeAPI",
-                code: -1,
-                userInfo: [NSLocalizedDescriptionKey: "Invalid response format"]
-            )
-        }
-
-        let duration = Date().timeIntervalSince(startTime)
-        return (text: text, duration: duration)
-    }
 }
+
+typealias ClaudeAPI = CompanionChatAPI
diff --git a/leanring-buddy/CompanionManager.swift b/leanring-buddy/CompanionManager.swift
index 0234cf19..8d7f3771 100644
--- a/leanring-buddy/CompanionManager.swift
+++ b/leanring-buddy/CompanionManager.swift
@@ -21,6 +21,18 @@ enum CompanionVoiceState {
     case responding
 }
 
+struct CompanionChatModelOption: Identifiable {
+    let id: String
+    let label: String
+
+    static let allOptions: [CompanionChatModelOption] = [
+        CompanionChatModelOption(id: "claude-sonnet-4-6", label: "Sonnet"),
+        CompanionChatModelOption(id: "claude-opus-4-6", label: "Opus"),
+        CompanionChatModelOption(id: "gpt-5.4", label: "GPT-5.4"),
+        CompanionChatModelOption(id: "gemini-2.5-flash", label: "Gemini 2.5")
+    ]
+}
+
 @MainActor
 final class CompanionManager: ObservableObject {
     @Published private(set) var voiceState: CompanionVoiceState = .idle
@@ -32,8 +44,8 @@ final class CompanionManager: ObservableObject {
     @Published private(set) var hasScreenContentPermission = false
 
     /// Screen location (global AppKit coords) of a detected UI element the
-    /// buddy should fly to and point at. Parsed from Claude's response;
-    /// observed by BlueCursorView to trigger the flight animation.
+    /// buddy should fly to and point at. Parsed from the selected model's
+    /// response; observed by BlueCursorView to trigger the flight animation.
     @Published var detectedElementScreenLocation: CGPoint?
     /// The display frame (global AppKit coords) of the screen the detected
     /// element is on, so BlueCursorView knows which screen overlay should animate.
@@ -72,16 +84,16 @@ final class CompanionManager: ObservableObject {
     /// through this so keys never ship in the app binary.
     private static let workerBaseURL = "https://your-worker-name.your-subdomain.workers.dev"
 
-    private lazy var claudeAPI: ClaudeAPI = {
-        return ClaudeAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel)
+    private lazy var chatAPI: CompanionChatAPI = {
+        return CompanionChatAPI(proxyURL: "\(Self.workerBaseURL)/chat", model: selectedModel)
     }()
 
     private lazy var elevenLabsTTSClient: ElevenLabsTTSClient = {
         return ElevenLabsTTSClient(proxyURL: "\(Self.workerBaseURL)/tts")
     }()
 
-    /// Conversation history so Claude remembers prior exchanges within a session.
-    /// Each entry is the user's transcript and Claude's response.
+    /// Conversation history so the selected model remembers prior exchanges within a session.
+    /// Each entry is the user's transcript and the assistant's response.
     private var conversationHistory: [(userTranscript: String, assistantResponse: String)] = []
 
     /// The currently running AI response task, if any. Cancelled when the user
@@ -107,13 +119,16 @@ final class CompanionManager: ObservableObject {
     /// Used by the panel to show accurate status text ("Active" vs "Ready").
     @Published private(set) var isOverlayVisible: Bool = false
 
-    /// The Claude model used for voice responses. Persisted to UserDefaults.
-    @Published var selectedModel: String = UserDefaults.standard.string(forKey: "selectedClaudeModel") ?? "claude-sonnet-4-6"
+    /// The currently selected chat model for voice responses. Persisted to UserDefaults.
+    @Published var selectedModel: String = UserDefaults.standard.string(forKey: "selectedChatModel")
+        ?? UserDefaults.standard.string(forKey: "selectedClaudeModel")
+        ?? "claude-sonnet-4-6"
 
     func setSelectedModel(_ model: String) {
         selectedModel = model
+        UserDefaults.standard.set(model, forKey: "selectedChatModel")
         UserDefaults.standard.set(model, forKey: "selectedClaudeModel")
-        claudeAPI.model = model
+        chatAPI.model = model
     }
 
     /// User preference for whether the Clicky cursor should be shown.
@@ -179,9 +194,9 @@ final class CompanionManager: ObservableObject {
         bindVoiceStateObservation()
         bindAudioPowerLevel()
         bindShortcutTransitions()
-        // Eagerly touch the Claude API so its TLS warmup handshake completes
+        // Eagerly touch the chat API so its TLS warmup handshake completes
         // well before the onboarding demo fires at ~40s into the video.
-        _ = claudeAPI
+        _ = chatAPI
 
         // If the user already completed onboarding AND all permissions are
         // still granted, show the cursor overlay immediately. If permissions
@@ -521,7 +536,7 @@ final class CompanionManager: ObservableObject {
                         self?.lastTranscript = finalTranscript
                         print("🗣️ Companion received transcript: \(finalTranscript)")
                         ClickyAnalytics.trackUserMessageSent(transcript: finalTranscript)
-                        self?.sendTranscriptToClaudeWithScreenshot(transcript: finalTranscript)
+                        self?.sendTranscriptToSelectedModelWithScreenshot(transcript: finalTranscript)
                     }
                 )
             }
@@ -578,12 +593,12 @@ final class CompanionManager: ObservableObject {
 
     // MARK: - AI Response Pipeline
 
-    /// Captures a screenshot, sends it along with the transcript to Claude,
+    /// Captures a screenshot, sends it along with the transcript to the selected model,
     /// and plays the response aloud via ElevenLabs TTS. The cursor stays in
     /// the spinner/processing state until TTS audio begins playing.
-    /// Claude's response may include a [POINT:x,y:label] tag which triggers
+    /// The model's response may include a [POINT:x,y:label] tag which triggers
     /// the buddy to fly to that element on screen.
-    private func sendTranscriptToClaudeWithScreenshot(transcript: String) {
+    private func sendTranscriptToSelectedModelWithScreenshot(transcript: String) {
         currentResponseTask?.cancel()
         elevenLabsTTSClient.stopPlayback()
 
@@ -598,19 +613,19 @@ final class CompanionManager: ObservableObject {
                 guard !Task.isCancelled else { return }
 
                 // Build image labels with the actual screenshot pixel dimensions
-                // so Claude's coordinate space matches the image it sees. We
+                // so the model's coordinate space matches the image it sees. We
                 // scale from screenshot pixels to display points ourselves.
                 let labeledImages = screenCaptures.map { capture in
                     let dimensionInfo = " (image dimensions: \(capture.screenshotWidthInPixels)x\(capture.screenshotHeightInPixels) pixels)"
                     return (data: capture.imageData, label: capture.label + dimensionInfo)
                 }
 
-                // Pass conversation history so Claude remembers prior exchanges
+                // Pass conversation history so the selected model remembers prior exchanges
                 let historyForAPI = conversationHistory.map { entry in
                     (userPlaceholder: entry.userTranscript, assistantResponse: entry.assistantResponse)
                 }
 
-                let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming(
+                let (fullResponseText, _) = try await chatAPI.analyzeImageStreaming(
                     images: labeledImages,
                     systemPrompt: Self.companionVoiceResponseSystemPrompt,
                     conversationHistory: historyForAPI,
@@ -622,11 +637,11 @@ final class CompanionManager: ObservableObject {
 
                 guard !Task.isCancelled else { return }
 
-                // Parse the [POINT:...] tag from Claude's response
+                // Parse the [POINT:...] tag from the model's response
                 let parseResult = Self.parsePointingCoordinates(from: fullResponseText)
                 let spokenText = parseResult.spokenText
 
-                // Handle element pointing if Claude returned coordinates.
+                // Handle element pointing if the model returned coordinates.
                 // Switch to idle BEFORE setting the location so the triangle
                 // becomes visible and can fly to the target. Without this, the
                 // spinner hides the triangle and the flight animation is invisible.
@@ -635,7 +650,7 @@ final class CompanionManager: ObservableObject {
                     voiceState = .idle
                 }
 
-                // Pick the screen capture matching Claude's screen number,
+                // Pick the screen capture matching the tagged screen number,
                 // falling back to the cursor screen if not specified.
                 let targetScreenCapture: CompanionScreenCapture? = {
                     if let screenNumber = parseResult.screenNumber,
@@ -647,7 +662,7 @@ final class CompanionManager: ObservableObject {
 
                 if let pointCoordinate = parseResult.coordinate,
                    let targetScreenCapture {
-                    // Claude's coordinates are in the screenshot's pixel space
+                    // The model's coordinates are in the screenshot's pixel space
                     // (top-left origin, e.g. 1280x831). Scale to the display's
                     // point space (e.g. 1512x982), then convert to AppKit global coords.
                     let screenshotWidth = CGFloat(targetScreenCapture.screenshotWidthInPixels)
@@ -767,11 +782,11 @@ final class CompanionManager: ObservableObject {
 
     // MARK: - Point Tag Parsing
 
-    /// Result of parsing a [POINT:...] tag from Claude's response.
+    /// Result of parsing a [POINT:...] tag from the model's response.
     struct PointingParseResult {
         /// The response text with the [POINT:...] tag removed — this is what gets spoken.
         let spokenText: String
-        /// The parsed pixel coordinate, or nil if Claude said "none" or no tag was found.
+        /// The parsed pixel coordinate, or nil if the model said "none" or no tag was found.
         let coordinate: CGPoint?
         /// Short label describing the element (e.g. "run button"), or "none".
         let elementLabel: String?
@@ -779,7 +794,7 @@ final class CompanionManager: ObservableObject {
         let screenNumber: Int?
     }
 
-    /// Parses a [POINT:x,y:label:screenN] or [POINT:none] tag from the end of Claude's response.
+    /// Parses a [POINT:x,y:label:screenN] or [POINT:none] tag from the end of the model's response.
     /// Returns the spoken text (tag removed) and the optional coordinate + label + screen number.
     static func parsePointingCoordinates(from responseText: String) -> PointingParseResult {
         // Match [POINT:none] or [POINT:123,456:label] or [POINT:123,456:label:screen2]
@@ -961,7 +976,7 @@ final class CompanionManager: ObservableObject {
     the screenshot images are labeled with their pixel dimensions. use those dimensions as the coordinate space. origin (0,0) is top-left. x increases rightward, y increases downward.
     """
 
-    /// Captures a screenshot and asks Claude to find something interesting to
+    /// Captures a screenshot and asks the selected model to find something interesting to
     /// point at, then triggers the buddy's flight animation. Used during
     /// onboarding to demo the pointing feature while the intro video plays.
     func performOnboardingDemoInteraction() {
@@ -972,7 +987,7 @@ final class CompanionManager: ObservableObject {
             do {
                 let screenCaptures = try await CompanionScreenCaptureUtility.captureAllScreensAsJPEG()
 
-                // Only send the cursor screen so Claude can't pick something
+                // Only send the cursor screen so the model can't pick something
                 // on a different monitor that we can't point at.
                 guard let cursorScreenCapture = screenCaptures.first(where: { $0.isCursorScreen }) else {
                     print("🎯 Onboarding demo: no cursor screen found")
@@ -982,7 +997,7 @@ final class CompanionManager: ObservableObject {
                 let dimensionInfo = " (image dimensions: \(cursorScreenCapture.screenshotWidthInPixels)x\(cursorScreenCapture.screenshotHeightInPixels) pixels)"
                 let labeledImages = [(data: cursorScreenCapture.imageData, label: cursorScreenCapture.label + dimensionInfo)]
 
-                let (fullResponseText, _) = try await claudeAPI.analyzeImageStreaming(
+                let (fullResponseText, _) = try await chatAPI.analyzeImageStreaming(
                     images: labeledImages,
                     systemPrompt: Self.onboardingDemoSystemPrompt,
                     userPrompt: "look around my screen and find something interesting to point at",
@@ -1012,7 +1027,7 @@ final class CompanionManager: ObservableObject {
                     y: appKitY + displayFrame.origin.y
                 )
 
-                // Set custom bubble text so the pointing animation uses Claude's
+                // Set custom bubble text so the pointing animation uses the model's
                 // comment instead of a random phrase
                 detectedElementBubbleText = parseResult.spokenText
                 detectedElementScreenLocation = globalLocation
diff --git a/leanring-buddy/CompanionPanelView.swift b/leanring-buddy/CompanionPanelView.swift
index 76789b4c..a4c93c2d 100644
--- a/leanring-buddy/CompanionPanelView.swift
+++ b/leanring-buddy/CompanionPanelView.swift
@@ -607,8 +607,9 @@ struct CompanionPanelView: View {
             Spacer()
 
             HStack(spacing: 0) {
-                modelOptionButton(label: "Sonnet", modelID: "claude-sonnet-4-6")
-                modelOptionButton(label: "Opus", modelID: "claude-opus-4-6")
+                ForEach(CompanionChatModelOption.allOptions) { modelOption in
+                    modelOptionButton(label: modelOption.label, modelID: modelOption.id)
+                }
             }
             .background(
                 RoundedRectangle(cornerRadius: 6, style: .continuous)
diff --git a/leanring-buddy/OpenAIAPI.swift b/leanring-buddy/OpenAIAPI.swift
index d0c3f2ae..bc43e43d 100644
--- a/leanring-buddy/OpenAIAPI.swift
+++ b/leanring-buddy/OpenAIAPI.swift
@@ -1,52 +1,41 @@
 //
 //  OpenAIAPI.swift
-//  OpenAI API Implementation
+//  OpenAI Responses API helper
 //
 
 import Foundation
 
-/// OpenAI API helper for vision analysis
 class OpenAIAPI {
     private let apiKey: String
     private let apiURL: URL
     private let model: String
     private let session: URLSession
 
-    init(apiKey: String, model: String = "gpt-5.2-2025-12-11") {
+    init(apiKey: String, model: String = "gpt-5.4") {
         self.apiKey = apiKey
-        self.apiURL = URL(string: "https://api.openai.com/v1/chat/completions")!
+        self.apiURL = URL(string: "https://api.openai.com/v1/responses")!
         self.model = model
 
-        // Use .default instead of .ephemeral so TLS session tickets are cached.
-        // Ephemeral sessions do a full TLS handshake on every request, which causes
-        // transient -1200 (errSSLPeerHandshakeFail) errors with large image payloads.
-        // Disable URL/cookie caching to avoid storing responses or credentials on disk.
-        let config = URLSessionConfiguration.default
-        config.timeoutIntervalForRequest = 120
-        config.timeoutIntervalForResource = 300
-        config.waitsForConnectivity = true
-        config.urlCache = nil
-        config.httpCookieStorage = nil
-        self.session = URLSession(configuration: config)
-
-        // Fire a lightweight HEAD request in the background to pre-establish the TLS
-        // connection. This caches the TLS session ticket so the first real API call
-        // (which carries a large image payload) doesn't need a cold TLS handshake.
+        let configuration = URLSessionConfiguration.default
+        configuration.timeoutIntervalForRequest = 120
+        configuration.timeoutIntervalForResource = 300
+        configuration.waitsForConnectivity = true
+        configuration.urlCache = nil
+        configuration.httpCookieStorage = nil
+        self.session = URLSession(configuration: configuration)
+
         warmUpTLSConnection()
     }
 
-    /// Sends a no-op HEAD request to the API host to establish and cache a TLS session.
-    /// Failures are silently ignored — this is purely an optimization.
     private func warmUpTLSConnection() {
         var warmupRequest = URLRequest(url: apiURL)
         warmupRequest.httpMethod = "HEAD"
         warmupRequest.timeoutInterval = 10
         session.dataTask(with: warmupRequest) { _, _, _ in
-            // Response doesn't matter — the TLS handshake is the goal
+            // Warming the TLS session is enough.
         }.resume()
     }
 
-    /// Send a vision request to OpenAI with one or more labeled images.
     func analyzeImage(
         images: [(data: Data, label: String)],
         systemPrompt: String,
@@ -55,62 +44,70 @@ class OpenAIAPI {
     ) async throws -> (text: String, duration: TimeInterval) {
         let startTime = Date()
 
-        // Build request
         var request = URLRequest(url: apiURL)
         request.httpMethod = "POST"
         request.timeoutInterval = 120
         request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")
         request.setValue("application/json", forHTTPHeaderField: "Content-Type")
 
-        // Build messages array
-        var messages: [[String: Any]] = []
-
-        // Add system message first
-        messages.append([
+        var input: [[String: Any]] = [[
             "role": "system",
-            "content": systemPrompt
-        ])
+            "content": [[
+                "type": "input_text",
+                "text": systemPrompt
+            ]]
+        ]]
 
-        // Add conversation history
         for (userPlaceholder, assistantResponse) in conversationHistory {
-            messages.append(["role": "user", "content": userPlaceholder])
-            messages.append(["role": "assistant", "content": assistantResponse])
+            input.append([
+                "role": "user",
+                "content": [[
+                    "type": "input_text",
+                    "text": userPlaceholder
+                ]]
+            ])
+            input.append([
+                "role": "assistant",
+                "content": [[
+                    "type": "output_text", // replayed assistant turns must use output_text; input_text is only valid for user/system content
+                    "text": assistantResponse
+                ]]
+            ])
         }
 
-        // Build current message with all labeled images + prompt
-        var contentBlocks: [[String: Any]] = []
+        var currentMessageContent: [[String: Any]] = []
         for image in images {
-            contentBlocks.append([
-                "type": "text",
+            currentMessageContent.append([
+                "type": "input_text",
                 "text": image.label
             ])
-            contentBlocks.append([
-                "type": "image_url",
-                "image_url": [
-                    "url": "data:image/jpeg;base64,\(image.data.base64EncodedString())"
-                ]
+            currentMessageContent.append([
+                "type": "input_image",
+                "image_url": "data:image/jpeg;base64,\(image.data.base64EncodedString())"
             ])
         }
-        contentBlocks.append([
-            "type": "text",
+        currentMessageContent.append([
+            "type": "input_text",
             "text": userPrompt
         ])
-        messages.append(["role": "user", "content": contentBlocks])
 
-        // Build request body
-        let body: [String: Any] = [
+        input.append([
+            "role": "user",
+            "content": currentMessageContent
+        ])
+
+        let requestBody: [String: Any] = [
             "model": model,
-            // `max_tokens` is deprecated/incompatible for some newer OpenAI models.
-            "max_completion_tokens": 600,
-            "messages": messages
+            "input": input,
+            "max_output_tokens": 600
         ]
 
-        let bodyData = try JSONSerialization.data(withJSONObject: body)
+        let bodyData = try JSONSerialization.data(withJSONObject: requestBody)
         request.httpBody = bodyData
+
         let payloadMB = Double(bodyData.count) / 1_048_576.0
-        print("🌐 OpenAI request: \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)")
+        print("🌐 OpenAI Responses request: \(String(format: "%.1f", payloadMB))MB, \(images.count) image(s)")
 
-        // Send request
         let (data, response) = try await session.data(for: request)
 
         guard let httpResponse = response as? HTTPURLResponse,
@@ -123,19 +120,27 @@ class OpenAIAPI {
             )
         }
 
-        // Parse response
         let json = try JSONSerialization.jsonObject(with: data) as? [String: Any]
-        guard let choices = json?["choices"] as? [[String: Any]],
-              let firstChoice = choices.first,
-              let message = firstChoice["message"] as? [String: Any],
-              let text = message["content"] as? String else {
-            throw NSError(
-                domain: "OpenAIAPI",
-                code: -1,
-                userInfo: [NSLocalizedDescriptionKey: "Invalid response format"]
-            )
+
+        if let outputText = json?["output_text"] as? String,
+           !outputText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
+            let duration = Date().timeIntervalSince(startTime)
+            return (text: outputText, duration: duration)
         }
 
+        let outputItems = json?["output"] as? [[String: Any]] ?? []
+        let text = outputItems
+            .flatMap { outputItem in
+                outputItem["content"] as? [[String: Any]] ?? []
+            }
+            .compactMap { contentItem -> String? in
+                guard let type = contentItem["type"] as? String, type == "output_text" else {
+                    return nil
+                }
+                return contentItem["text"] as? String
+            }
+            .joined()
+
         let duration = Date().timeIntervalSince(startTime)
         return (text: text, duration: duration)
     }
diff --git a/worker/src/index.ts b/worker/src/index.ts
index 2e3e9345..031e89da 100644
--- a/worker/src/index.ts
+++ b/worker/src/index.ts
@@ -1,25 +1,48 @@
 /**
  * Clicky Proxy Worker
  *
- * Proxies requests to Claude and ElevenLabs APIs so the app never
- * ships with raw API keys. Keys are stored as Cloudflare secrets.
- *
- * Routes:
- *   POST /chat  → Anthropic Messages API (streaming)
- *   POST /tts   → ElevenLabs TTS API
+ * Normalizes Claude, OpenAI, and Gemini multimodal chat into a single
+ * SSE response shape that the macOS client already knows how to parse.
  */
 
 interface Env {
   ANTHROPIC_API_KEY: string;
+  OPENAI_API_KEY: string;
+  GEMINI_API_KEY: string;
   ELEVENLABS_API_KEY: string;
   ELEVENLABS_VOICE_ID: string;
   ASSEMBLYAI_API_KEY: string;
 }
 
+interface ChatConversationTurn {
+  user_transcript: string; // replayed to the provider as an earlier user message
+  assistant_response: string; // replayed as the model's reply to that message
+}
+
+interface ChatImageInput {
+  media_type: string; // MIME type forwarded to the provider with the image
+  data: string; // base64-encoded image bytes
+  label: string; // text sent alongside the image in the same user message
+}
+
+interface ChatRequestBody {
+  model: string; // model ID; its prefix (claude-/gpt-/gemini-) selects the provider
+  system_prompt: string; // required, must be non-blank
+  conversation_history?: ChatConversationTurn[]; // treated as empty when omitted
+  images?: ChatImageInput[]; // treated as empty when omitted
+  user_prompt: string; // required; appended after any images in the current user message
+}
+
+type ChatProvider = "anthropic" | "openai" | "gemini";
+
 export default {
   async fetch(request: Request, env: Env): Promise<Response> {
     const url = new URL(request.url);
 
+    if (request.method === "HEAD") {
+      return new Response(null, { status: 200 });
+    }
+
     if (request.method !== "POST") {
       return new Response("Method not allowed", { status: 405 });
     }
@@ -38,10 +61,10 @@ export default {
       }
     } catch (error) {
       console.error(`[${url.pathname}] Unhandled error:`, error);
-      return new Response(
-        JSON.stringify({ error: String(error) }),
-        { status: 500, headers: { "content-type": "application/json" } }
-      );
+      return new Response(JSON.stringify({ error: String(error) }), {
+        status: 500,
+        headers: { "content-type": "application/json" },
+      });
     }
 
     return new Response("Not found", { status: 404 });
@@ -49,7 +72,93 @@ export default {
 };
 
 async function handleChat(request: Request, env: Env): Promise<Response> {
-  const body = await request.text();
+  const requestBody = (await request.json()) as ChatRequestBody; // type cast only; required fields are checked on the next line
+  validateChatRequestBody(requestBody);
+
+  const provider = resolveChatProvider(requestBody.model); // throws for unrecognized model prefixes
+  let responseText = "";
+
+  if (provider === "anthropic") {
+    responseText = await requestAnthropicChat(requestBody, env);
+  } else if (provider === "openai") {
+    responseText = await requestOpenAIChat(requestBody, env);
+  } else {
+    responseText = await requestGeminiChat(requestBody, env);
+  }
+
+  return makeNormalizedSSEResponse(responseText); // same SSE shape regardless of provider
+}
+
+function validateChatRequestBody(requestBody: ChatRequestBody): void { // throws when a required field is absent, blank, or not a string
+  if (typeof requestBody?.model !== "string" || !requestBody.model.trim()) { // body is unvalidated JSON, so check the runtime type as well
+    throw new Error("Missing chat model.");
+  }
+
+  if (typeof requestBody.system_prompt !== "string" || !requestBody.system_prompt.trim()) {
+    throw new Error("Missing system prompt.");
+  }
+
+  if (typeof requestBody.user_prompt !== "string" || !requestBody.user_prompt.trim()) {
+    throw new Error("Missing user prompt.");
+  }
+}
+
+function resolveChatProvider(model: string): ChatProvider { // maps a model ID to its upstream provider by prefix
+  if (model.startsWith("claude-")) {
+    return "anthropic";
+  }
+
+  if (model.startsWith("gpt-")) {
+    return "openai";
+  }
+
+  if (model.startsWith("gemini-")) {
+    return "gemini";
+  }
+
+  throw new Error(`Unsupported chat model: ${model}`); // any other prefix is rejected
+}
+
+async function requestAnthropicChat(requestBody: ChatRequestBody, env: Env): Promise<string> {
+  const messages: Array<Record<string, unknown>> = [];
+
+  for (const conversationTurn of requestBody.conversation_history ?? []) {
+    messages.push({
+      role: "user",
+      content: conversationTurn.user_transcript,
+    });
+    messages.push({
+      role: "assistant",
+      content: conversationTurn.assistant_response,
+    });
+  }
+
+  const currentContentBlocks: Array<Record<string, unknown>> = [];
+
+  for (const image of requestBody.images ?? []) {
+    currentContentBlocks.push({
+      type: "image",
+      source: {
+        type: "base64",
+        media_type: image.media_type,
+        data: image.data,
+      },
+    });
+    currentContentBlocks.push({
+      type: "text",
+      text: image.label,
+    });
+  }
+
+  currentContentBlocks.push({
+    type: "text",
+    text: requestBody.user_prompt,
+  });
+
+  messages.push({
+    role: "user",
+    content: currentContentBlocks,
+  });
 
   const response = await fetch("https://api.anthropic.com/v1/messages", {
     method: "POST",
@@ -58,23 +167,240 @@ async function handleChat(request: Request, env: Env): Promise<Response> {
       "anthropic-version": "2023-06-01",
       "content-type": "application/json",
     },
-    body,
+    body: JSON.stringify({
+      model: requestBody.model,
+      max_tokens: 1024,
+      system: requestBody.system_prompt,
+      messages,
+    }),
   });
 
   if (!response.ok) {
     const errorBody = await response.text();
     console.error(`[/chat] Anthropic API error ${response.status}: ${errorBody}`);
-    return new Response(errorBody, {
-      status: response.status,
-      headers: { "content-type": "application/json" },
+    throw new Error(`Anthropic API error ${response.status}: ${errorBody}`);
+  }
+
+  const responseJson = (await response.json()) as {
+    content?: Array<{ type?: string; text?: string }>;
+  };
+
+  const responseText = (responseJson.content ?? [])
+    .filter((contentBlock) => contentBlock.type === "text")
+    .map((contentBlock) => contentBlock.text ?? "")
+    .join("");
+
+  return responseText.trim();
+}
+
+async function requestOpenAIChat(requestBody: ChatRequestBody, env: Env): Promise<string> {
+  const input: Array<Record<string, unknown>> = [ // Responses API input: system prompt, prior turns, then the current user turn
+    {
+      role: "system",
+      content: [
+        {
+          type: "input_text",
+          text: requestBody.system_prompt,
+        },
+      ],
+    },
+  ];
+
+  for (const conversationTurn of requestBody.conversation_history ?? []) {
+    input.push({
+      role: "user",
+      content: [
+        {
+          type: "input_text",
+          text: conversationTurn.user_transcript,
+        },
+      ],
+    });
+    input.push({
+      role: "assistant",
+      content: [
+        {
+          type: "output_text", // replayed assistant turns must use output_text; input_text is only valid for user/system content
+          text: conversationTurn.assistant_response,
+        },
+      ],
     });
   }
 
-  return new Response(response.body, {
-    status: response.status,
+  const currentContentBlocks: Array<Record<string, unknown>> = [];
+
+  for (const image of requestBody.images ?? []) {
+    currentContentBlocks.push({
+      type: "input_text",
+      text: image.label,
+    });
+    currentContentBlocks.push({
+      type: "input_image",
+      image_url: `data:${image.media_type};base64,${image.data}`,
+    });
+  }
+
+  currentContentBlocks.push({
+    type: "input_text",
+    text: requestBody.user_prompt,
+  });
+
+  input.push({
+    role: "user",
+    content: currentContentBlocks,
+  });
+
+  const response = await fetch("https://api.openai.com/v1/responses", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${env.OPENAI_API_KEY}`,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      model: requestBody.model,
+      input,
+      max_output_tokens: 1024,
+    }),
+  });
+
+  if (!response.ok) {
+    const errorBody = await response.text();
+    console.error(`[/chat] OpenAI API error ${response.status}: ${errorBody}`);
+    throw new Error(`OpenAI API error ${response.status}: ${errorBody}`);
+  }
+
+  const responseJson = (await response.json()) as {
+    output_text?: string;
+    output?: Array<{
+      content?: Array<{ type?: string; text?: string }>;
+    }>;
+  };
+
+  if (responseJson.output_text?.trim()) { // use the top-level text field when it is present and non-blank
+    return responseJson.output_text.trim();
+  }
+
+  const responseText = (responseJson.output ?? []) // otherwise concatenate every output_text part of every output item
+    .flatMap((outputItem) => outputItem.content ?? [])
+    .filter((contentItem) => contentItem.type === "output_text")
+    .map((contentItem) => contentItem.text ?? "")
+    .join("");
+
+  return responseText.trim();
+}
+
+async function requestGeminiChat(requestBody: ChatRequestBody, env: Env): Promise<string> {
+  const contents: Array<Record<string, unknown>> = []; // prior turns first, then the current user turn
+
+  for (const conversationTurn of requestBody.conversation_history ?? []) {
+    contents.push({
+      role: "user",
+      parts: [
+        {
+          text: conversationTurn.user_transcript,
+        },
+      ],
+    });
+    contents.push({
+      role: "model", // assistant turns are sent with role "model"
+      parts: [
+        {
+          text: conversationTurn.assistant_response,
+        },
+      ],
+    });
+  }
+
+  const currentParts: Array<Record<string, unknown>> = [];
+
+  for (const image of requestBody.images ?? []) {
+    currentParts.push({
+      text: image.label,
+    });
+    currentParts.push({
+      inline_data: {
+        mime_type: image.media_type,
+        data: image.data,
+      },
+    });
+  }
+
+  currentParts.push({
+    text: requestBody.user_prompt,
+  });
+
+  contents.push({
+    role: "user",
+    parts: currentParts,
+  });
+
+  const response = await fetch(
+    `https://generativelanguage.googleapis.com/v1beta/models/${encodeURIComponent(requestBody.model)}:generateContent`, // model ID is URL-encoded into the path
+    {
+      method: "POST",
+      headers: {
+        "x-goog-api-key": env.GEMINI_API_KEY,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        system_instruction: {
+          parts: [
+            {
+              text: requestBody.system_prompt,
+            },
+          ],
+        },
+        contents,
+      }),
+    }
+  );
+
+  if (!response.ok) {
+    const errorBody = await response.text();
+    console.error(`[/chat] Gemini API error ${response.status}: ${errorBody}`);
+    throw new Error(`Gemini API error ${response.status}: ${errorBody}`); // non-2xx upstream replies surface as thrown errors
+  }
+
+  const responseJson = (await response.json()) as {
+    candidates?: Array<{
+      content?: {
+        parts?: Array<{ text?: string }>;
+      };
+    }>;
+  };
+
+  const responseText = (responseJson.candidates ?? []) // concatenate the text of every part of every candidate
+    .flatMap((candidate) => candidate.content?.parts ?? [])
+    .map((part) => part.text ?? "")
+    .join("");
+
+  return responseText.trim();
+}
+
+function makeNormalizedSSEResponse(responseText: string): Response { // wraps the complete reply in a two-line SSE stream
+  const encoder = new TextEncoder();
+  const normalizedChunk = JSON.stringify({ // one delta event carrying the entire response text
+    type: "content_block_delta",
+    delta: {
+      type: "text_delta",
+      text: responseText,
+    },
+  });
+
+  const stream = new ReadableStream<Uint8Array>({
+    start(controller) {
+      controller.enqueue(encoder.encode(`data: ${normalizedChunk}\n\n`));
+      controller.enqueue(encoder.encode("data: [DONE]\n\n")); // end-of-stream marker sent after the single delta
+      controller.close();
+    },
+  });
+
+  return new Response(stream, {
+    status: 200,
     headers: {
-      "content-type": response.headers.get("content-type") || "text/event-stream",
+      "content-type": "text/event-stream; charset=utf-8",
       "cache-control": "no-cache",
+      connection: "keep-alive",
     },
   });
 }
@@ -110,18 +436,15 @@ async function handleTTS(request: Request, env: Env): Promise<Response> {
   const body = await request.text();
   const voiceId = env.ELEVENLABS_VOICE_ID;
 
-  const response = await fetch(
-    `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
-    {
-      method: "POST",
-      headers: {
-        "xi-api-key": env.ELEVENLABS_API_KEY,
-        "content-type": "application/json",
-        accept: "audio/mpeg",
-      },
-      body,
-    }
-  );
+  const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, {
+    method: "POST",
+    headers: {
+      "xi-api-key": env.ELEVENLABS_API_KEY,
+      "content-type": "application/json",
+      accept: "audio/mpeg",
+    },
+    body,
+  });
 
   if (!response.ok) {
     const errorBody = await response.text();