diff --git a/TargetBridge-Receiver/TBReceiverC/src/display.c b/TargetBridge-Receiver/TBReceiverC/src/display.c index 5e11238..d753946 100644 --- a/TargetBridge-Receiver/TBReceiverC/src/display.c +++ b/TargetBridge-Receiver/TBReceiverC/src/display.c @@ -241,7 +241,7 @@ struct tb_display *tb_disp_create(int fullscreen) { * anisotropic where supported. Must be set BEFORE renderer creation. */ SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "best"); - if (SDL_Init(SDL_INIT_VIDEO) < 0) { + if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO) < 0) { fprintf(stderr, "[disp] SDL_Init: %s\n", SDL_GetError()); return NULL; } diff --git a/TargetBridge-Receiver/TBReceiverC/src/main.c b/TargetBridge-Receiver/TBReceiverC/src/main.c index bc09439..52ddfc9 100644 --- a/TargetBridge-Receiver/TBReceiverC/src/main.c +++ b/TargetBridge-Receiver/TBReceiverC/src/main.c @@ -30,6 +30,8 @@ #include #include +#define AUDIO_BUF_CAP (192000) // 1 second buffer of 48000Hz stereo 16-bit PCM + struct app { struct tb_display *disp; struct tb_decoder *dec; @@ -53,6 +55,13 @@ struct app { DNSServiceRef bonjour_ref; char bonjour_name[128]; + + SDL_AudioDeviceID audio_device; + + uint8_t audio_buf[AUDIO_BUF_CAP]; + int audio_buf_head; + int audio_buf_tail; + int audio_buf_size; }; static volatile sig_atomic_t g_term = 0; @@ -221,6 +230,29 @@ static void on_frame(const uint8_t *y, int y_stride, a->frames++; } +static void ring_read(struct app *a, Uint8 *dst, int len) { + int first = AUDIO_BUF_CAP - a->audio_buf_tail; + if (first >= len) { + memcpy(dst, a->audio_buf + a->audio_buf_tail, len); + } else { + memcpy(dst, a->audio_buf + a->audio_buf_tail, first); + memcpy(dst + first, a->audio_buf, len - first); + } + a->audio_buf_tail = (a->audio_buf_tail + len) % AUDIO_BUF_CAP; + a->audio_buf_size -= len; +} + +static void audio_callback(void *userdata, Uint8 *stream, int len) { + struct app *a = (struct app *)userdata; + if (a->audio_buf_size >= len) { + ring_read(a, stream, len); + } else { + int available = a->audio_buf_size; + if (available > 0) ring_read(a, stream, available); + memset(stream + available, 0, len - available); + } +} + /* ---- Callbacks: parser → decoder ------------------------------------- */ static void on_packet(uint8_t type, const uint8_t *payload, size_t len, void *ud) { @@ -290,6 +322,35 @@ static void on_packet(uint8_t type, const uint8_t *payload, size_t len, void *ud tb_disp_set_cursor(a->disp, x, y, w, h, visible, type); } break; + case TB_PKT_AUDIO_FRAME: + if (a->audio_device != 0) { + SDL_LockAudioDevice(a->audio_device); + + // Limit audio backlog to 150ms (150 * 192 = 28800 bytes) to cushion against network/scheduling jitter. + // If the buffer would exceed this, smoothly discard the oldest excess bytes. + const int cap_bytes = 28800; + if (a->audio_buf_size + len > cap_bytes) { + int excess = (a->audio_buf_size + len) - cap_bytes; + a->audio_buf_tail = (a->audio_buf_tail + excess) % AUDIO_BUF_CAP; + a->audio_buf_size -= excess; + } + + // Write payload to circular buffer + if (a->audio_buf_size + (int)len <= AUDIO_BUF_CAP) { + int first = AUDIO_BUF_CAP - a->audio_buf_head; + if (first >= (int)len) { + memcpy(a->audio_buf + a->audio_buf_head, payload, len); + } else { + memcpy(a->audio_buf + a->audio_buf_head, payload, first); + memcpy(a->audio_buf, payload + first, len - first); + } + a->audio_buf_head = (a->audio_buf_head + (int)len) % AUDIO_BUF_CAP; + a->audio_buf_size += (int)len; + } + + SDL_UnlockAudioDevice(a->audio_device); + } + break; case TB_PKT_HEARTBEAT: break; case TB_PKT_TEST_DATA: @@ -308,11 +369,13 @@ static void on_packet(uint8_t type, const uint8_t *payload, size_t len, void *ud /* ---- Networking helpers ---------------------------------------------- */ -static int drain_socket(struct app *a) { +static int drain_socket(struct app *a, int *bytes_read) { uint8_t buf[1024 * 1024]; + if (bytes_read) *bytes_read = 0; for (;;) { ssize_t n = read(a->client_fd, buf, sizeof(buf)); if (n > 0) { + if (bytes_read) *bytes_read += n; if (tb_parser_feed(&a->parser, buf, (size_t)n) < 0) return -1; } else if (n == 0) { return -1; /* peer closed */ @@ -422,6 +485,14 @@ static void close_client(struct app *a) { tb_parser_free(&a->parser); tb_parser_init(&a->parser, on_packet, a); tb_dec_reset(a->dec); /* fresh decoder for next session */ + if (a->audio_device != 0) { + SDL_LockAudioDevice(a->audio_device); + a->audio_buf_head = 0; + a->audio_buf_tail = 0; + a->audio_buf_size = 0; + SDL_UnlockAudioDevice(a->audio_device); + } + fprintf(stderr, "[main] client disconnected\n"); } @@ -464,6 +535,24 @@ int main(int argc, char **argv) { a.disp = tb_disp_create(fullscreen); if (!a.disp) { fprintf(stderr, "tb_disp_create failed\n"); return 1; } + /* Open SDL Audio Device */ + SDL_AudioSpec spec; + SDL_zero(spec); + spec.freq = 48000; + spec.format = AUDIO_S16LSB; // 16-bit signed, little-endian PCM + spec.channels = 2; // Stereo + spec.samples = 1024; // Buffer size (approx 21.3ms) + spec.callback = audio_callback; + spec.userdata = &a; + SDL_AudioSpec obtained; + a.audio_device = SDL_OpenAudioDevice(NULL, 0, &spec, &obtained, 0); + if (a.audio_device != 0) { + SDL_PauseAudioDevice(a.audio_device, 0); // Start playing (unpaused) + fprintf(stderr, "[main] SDL audio device opened: 48000Hz stereo 16-bit PCM (obtained %d samples)\n", obtained.samples); + } else { + fprintf(stderr, "[main] warning: SDL_OpenAudioDevice failed: %s\n", SDL_GetError()); + } + struct tb_display_info boot_info; if (tb_disp_get_info(a.disp, &boot_info) == 0) { snprintf(a.panel_text, sizeof(a.panel_text), "%u x %u px (%s)", @@ -486,6 +575,7 @@ int main(int argc, char **argv) { while (!g_term && !tb_disp_poll_quit(a.disp)) { uint64_t t = now_ms(); + int bytes_read = 0; if (t - a.last_ip_check_ms >= 1000) { char refreshed_ip[64] = {0}; @@ -511,7 +601,7 @@ int main(int argc, char **argv) { send_receiver_info(&a); } } else { - if (drain_socket(&a) < 0) close_client(&a); + if (drain_socket(&a, &bytes_read) < 0) close_client(&a); else if (a.close_requested) close_client(&a); } @@ -528,8 +618,9 @@ int main(int argc, char **argv) { } /* Yield only while idle. During active video, keep draining and - * rendering without injecting an extra millisecond of latency. */ - if (a.client_fd < 0 || !a.have_video_frame) { + * rendering without injecting an extra millisecond of latency. + * If we didn't read any bytes from the socket, we can safely yield 1ms. */ + if (a.client_fd < 0 || !a.have_video_frame || bytes_read == 0) { SDL_Delay(1); } } @@ -539,6 +630,9 @@ int main(int argc, char **argv) { bonjour_deinit(&a); tb_parser_free(&a.parser); tb_dec_destroy(a.dec); + if (a.audio_device != 0) { + SDL_CloseAudioDevice(a.audio_device); + } tb_disp_destroy(a.disp); fprintf(stderr, "[main] bye\n"); return 0; diff --git a/TargetBridge-Receiver/TBReceiverC/src/proto.h b/TargetBridge-Receiver/TBReceiverC/src/proto.h index f2d4124..75c853c 100644 --- a/TargetBridge-Receiver/TBReceiverC/src/proto.h +++ b/TargetBridge-Receiver/TBReceiverC/src/proto.h @@ -30,6 +30,7 @@ #define TB_PKT_CREATE_SESSION_ACK 0x12 #define TB_PKT_PARAM_SETS 0x20 #define TB_PKT_FRAME 0x21 +#define TB_PKT_AUDIO_FRAME 0x23 #define TB_PKT_HEARTBEAT 0x30 #define TB_PKT_TEARDOWN 0x31 #define TB_PKT_CURSOR 0x32 diff --git a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderContentView.swift b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderContentView.swift index 39d0702..d44f88f 100644 --- a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderContentView.swift +++ b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderContentView.swift @@ -163,6 +163,9 @@ struct TBDisplaySenderContentView: View { Toggle(TBDisplaySenderL10n.largeCursor(service.language), isOn: $service.largeCursor) .disabled(service.anyConnected) + + Toggle(TBDisplaySenderL10n.streamAudio(service.language), isOn: $service.audioEnabled) + .disabled(service.anyConnected) } } } @@ -266,6 +269,11 @@ private struct TBDisplaySenderSessionCard: View { .disabled(session.isConnected || session.isStreaming) } + controlRow(TBDisplaySenderL10n.streamAudio(service.language)) { + Toggle("", isOn: $session.audioEnabled) + .labelsHidden() + } + VStack(alignment: .leading, spacing: 4) { Text(TBDisplaySenderL10n.streamHint1(service.language)) Text(TBDisplaySenderL10n.streamHint2(service.language)) diff --git a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderLocalization.swift b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderLocalization.swift index 18fa916..fa92e90 100644 --- a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderLocalization.swift +++ b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderLocalization.swift @@ -480,6 +480,14 @@ enum TBDisplaySenderL10n { } } + static func streamAudio(_ language: TBDisplaySenderLanguage) -> String { + switch language { + case .italian: return "Trasmetti audio del Mac" + case .english: return "Stream Mac audio" + case .german: return "Mac-Audio übertragen" + } + } + static func showMainWindow(_ language: TBDisplaySenderLanguage) -> String { switch language { case .italian: return "Mostra finestra principale" diff --git a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderManager.swift b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderManager.swift index 929d4c2..72177f0 100644 --- a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderManager.swift +++ b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderManager.swift @@ -31,6 +31,12 @@ final class TBDisplaySenderService: ObservableObject { objectWillChange.send() } } + @Published var audioEnabled: Bool = UserDefaults.standard.object(forKey: "fd.tbdisplaysender.audioEnabled") as? Bool ?? true { + didSet { + UserDefaults.standard.set(audioEnabled, forKey: "fd.tbdisplaysender.audioEnabled") + objectWillChange.send() + } + } private var sessionCancellables: [UUID: AnyCancellable] = [:] private let receiverDiscovery = TBReceiverDiscovery() @@ -70,7 +76,7 @@ final class TBDisplaySenderService: ObservableObject { } func addSession() { - let session = TBDisplaySenderSession(language: language, largeCursor: largeCursor) + let session = TBDisplaySenderSession(language: language, largeCursor: largeCursor, audioEnabled: audioEnabled) if let previous = sessions.last { session.capturePreset = previous.capturePreset session.captureSource = previous.captureSource diff --git a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderService.swift b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderService.swift index 5238797..3cad571 100644 --- a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderService.swift +++ b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderService.swift @@ -4,6 +4,7 @@ import CoreMedia import CoreVideo import Darwin import Foundation +import AVFoundation import IOSurface import Network @preconcurrency import ScreenCaptureKit @@ -111,7 +112,7 @@ enum TBDisplayCapturePreset: String, CaseIterable, Identifiable { if let envVal = ProcessInfo.processInfo.environment["QD"], let parsed = Int(envVal) { return parsed } - return 2 + return 6 } var expectedFrameRate: Int { @@ -170,7 +171,7 @@ enum TBDisplayCapturePreset: String, CaseIterable, Identifiable { if let envVal = ProcessInfo.processInfo.environment["MPVP"], let parsed = Int(envVal) { return parsed } - return 3 + return 10 } var maxFrameDelayCount: Int { @@ -195,7 +196,7 @@ enum TBDisplayCapturePreset: String, CaseIterable, Identifiable { if let envVal = ProcessInfo.processInfo.environment["MIFEF"], let parsed = Int(envVal) { return parsed } - return 5 + return 6 } var captureResolution: SCCaptureResolutionType { @@ -306,6 +307,7 @@ private final class TBDirectDisplayStreamCapture { @MainActor final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @unchecked Sendable { private static let receiverIPDefaultsKey = "fd.tbdisplaysender.receiverIP" + private static let audioEnabledDefaultsKey = "fd.tbdisplaysender.audioEnabled" private struct SavedExtendedDisplayArrangement { let x: Int32 let y: Int32 @@ -394,12 +396,13 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u let id = UUID() - init(language: TBDisplaySenderLanguage, largeCursor: Bool) { + init(language: TBDisplaySenderLanguage, largeCursor: Bool, audioEnabled: Bool) { self.statusText = TBDisplaySenderStatusState.ready.text(language) self.receiverPanelText = TBDisplaySenderL10n.waitingReceiverProfile(language) self.virtualDisplayText = TBDisplaySenderL10n.virtualDisplayNotCreated(language) self.language = language self.largeCursor = largeCursor + self.audioEnabled = audioEnabled self.streamResolutionText = TBDisplaySenderL10n.streamSummary( preset: .standard1440p, source: .desktopMirror, @@ -421,6 +424,7 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u UserDefaults.standard.set(receiverIP, forKey: Self.receiverIPDefaultsKey) } } + @Published var audioEnabled: Bool @Published var senderFPS = 0 @Published var receiverPanelText: String @Published var virtualDisplayText: String @@ -453,6 +457,7 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u private var recvBuffer = Data() private var session = ReceiverBackedVirtualDisplaySession() + private let audioConverter = SBAudioConverter() private var activeProfile: TBMonitorDisplayProfile? private var captureDelegate: CaptureDelegate? @@ -480,6 +485,7 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u private final class CaptureDelegate: NSObject, SCStreamOutput, SCStreamDelegate { var onFrame: ((CMSampleBuffer) -> Void)? + var onAudio: ((CMSampleBuffer) -> Void)? var onError: ((Error) -> Void)? private static func shouldProcessFrame(_ sampleBuffer: CMSampleBuffer) -> Bool { @@ -504,6 +510,10 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u nonisolated func stream(_ stream: SCStream, didOutputSampleBuffer sampleBuffer: CMSampleBuffer, of type: SCStreamOutputType) { + if type == .audio { + onAudio?(sampleBuffer) + return + } guard type == .screen else { return } guard Self.shouldProcessFrame(sampleBuffer) else { return } onFrame?(sampleBuffer) @@ -1000,15 +1010,25 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u do { let preset = capturePreset - if captureSource == .extendedDesktop, session.displayID != kCGNullDirectDisplay { - if startDirectDisplayStream(displayID: session.displayID, preset: preset) { - return true + let display: SCDisplay + if captureSource == .desktopMirror { + // In mirror mode, the virtual display mirrors the main display. + // We capture the main display directly via ScreenCaptureKit to stream both video and system audio. + let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false) + guard let mainDisplay = content.displays.first(where: { $0.displayID == CGMainDisplayID() }) else { + return false + } + display = mainDisplay + } else { + // In extended desktop mode, we capture the virtual display using ScreenCaptureKit + // to support both video and high-fidelity system audio. + let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false) + if session.displayID != kCGNullDirectDisplay, + let targetDisplay = content.displays.first(where: { $0.displayID == session.displayID }) { + display = targetDisplay + } else { + display = try await waitForCaptureDisplay() } - } - - let display = try await waitForCaptureDisplay() - if startDirectDisplayStream(displayID: display.displayID, preset: preset) { - return true } let configuration = SCStreamConfiguration() @@ -1020,6 +1040,10 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u configuration.showsCursor = !largeCursor configuration.scalesToFit = true configuration.captureResolution = preset.captureResolution + configuration.capturesAudio = true + configuration.excludesCurrentProcessAudio = true + configuration.sampleRate = 48000 + configuration.channelCount = 2 setupEncoder( width: preset.width, @@ -1034,6 +1058,9 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u delegate.onFrame = { [weak self] sampleBuffer in self?.encode(sampleBuffer) } + delegate.onAudio = { [weak self] sampleBuffer in + self?.processAudio(sampleBuffer) + } delegate.onError = { [weak self] error in Task { @MainActor [weak self] in guard let self else { return } @@ -1051,6 +1078,11 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u type: .screen, sampleHandlerQueue: DispatchQueue(label: "fd.tbmonitor.sender.capture", qos: .userInteractive) ) + try stream.addStreamOutput( + delegate, + type: .audio, + sampleHandlerQueue: DispatchQueue(label: "fd.tbmonitor.sender.audio", qos: .userInteractive) + ) try await stream.startCapture() scStream = stream isStreaming = true @@ -1683,8 +1715,137 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u } } + private func processAudio(_ sampleBuffer: CMSampleBuffer) { + guard audioEnabled else { return } + guard let data = audioConverter.convert(sampleBuffer: sampleBuffer) else { return } + let packet = TBMonitorProtocol.makePacket(type: .audioFrame, payload: data) + send(packet) + } + private func send(_ packet: Data) { connection?.send(content: packet, completion: .contentProcessed({ _ in })) } } + +private final class SBAudioConverter: Sendable { + private let converterState: LockedConverterState = LockedConverterState() + + private final class LockedConverterState: @unchecked Sendable { + private let lock = NSLock() + var converter: AVAudioConverter? + var inputFormat: AVAudioFormat? + let outputFormat: AVAudioFormat + + init() { + var asbd = AudioStreamBasicDescription( + mSampleRate: 48000.0, + mFormatID: kAudioFormatLinearPCM, + mFormatFlags: kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked, + mBytesPerPacket: 4, + mFramesPerPacket: 1, + mBytesPerFrame: 4, + mChannelsPerFrame: 2, + mBitsPerChannel: 16, + mReserved: 0 + ) + self.outputFormat = AVAudioFormat(streamDescription: &asbd)! + } + + func convert(sampleBuffer: CMSampleBuffer) -> Data? { + lock.lock() + defer { lock.unlock() } + + guard let formatDesc = CMSampleBufferGetFormatDescription(sampleBuffer) else { return nil } + guard let asbdPointer = CMAudioFormatDescriptionGetStreamBasicDescription(formatDesc) else { return nil } + let inputASBD = asbdPointer.pointee + + // Recreate converter if input format changes + if inputFormat == nil || + inputFormat!.streamDescription.pointee.mFormatFlags != inputASBD.mFormatFlags || + inputFormat!.streamDescription.pointee.mSampleRate != inputASBD.mSampleRate || + inputFormat!.streamDescription.pointee.mChannelsPerFrame != inputASBD.mChannelsPerFrame { + var mutableASBD = inputASBD + guard let inFormat = AVAudioFormat(streamDescription: &mutableASBD) else { return nil } + self.inputFormat = inFormat + self.converter = AVAudioConverter(from: inFormat, to: outputFormat) + } + + guard let converter = self.converter, let inFormat = self.inputFormat else { return nil } + + let frameCount = sampleBuffer.numSamples + guard frameCount > 0 else { return nil } + let audioFrameCount = AVAudioFrameCount(frameCount) + + // Create input buffer + guard let inputBuffer = AVAudioPCMBuffer(pcmFormat: inFormat, frameCapacity: audioFrameCount) else { return nil } + inputBuffer.frameLength = audioFrameCount + + // Extract audio data from sampleBuffer into inputBuffer + let channelCount = Int(inFormat.channelCount) + let bufferListSize = MemoryLayout.size + (channelCount - 1) * MemoryLayout.size + let bufferListRaw = UnsafeMutableRawPointer.allocate(byteCount: bufferListSize, alignment: MemoryLayout.alignment) + defer { bufferListRaw.deallocate() } + + let ablPointer = bufferListRaw.assumingMemoryBound(to: AudioBufferList.self) + var blockBuffer: CMBlockBuffer? + + let status = CMSampleBufferGetAudioBufferListWithRetainedBlockBuffer( + sampleBuffer, + bufferListSizeNeededOut: nil, + bufferListOut: ablPointer, + bufferListSize: bufferListSize, + blockBufferAllocator: nil, + blockBufferMemoryAllocator: nil, + flags: kCMSampleBufferFlag_AudioBufferList_Assure16ByteAlignment, + blockBufferOut: &blockBuffer + ) + + guard status == noErr else { return nil } + + let firstBufferPtr = withUnsafeMutablePointer(to: &ablPointer.pointee.mBuffers) { $0 } + let buffers = UnsafeBufferPointer(start: firstBufferPtr, count: channelCount) + + if inFormat.isInterleaved { + // ScreenCaptureKit always delivers float32 non-interleaved audio. + // This branch should never execute; assert to catch unexpected format changes. + assertionFailure("SBAudioConverter: unexpected interleaved input format from ScreenCaptureKit") + return nil + } else { + for i in 0.. Data? { + return converterState.convert(sampleBuffer: sampleBuffer) + } +} diff --git a/TargetBridge-Sender/TBDisplayShared/TBMonitorProtocol.swift b/TargetBridge-Sender/TBDisplayShared/TBMonitorProtocol.swift index ec93814..d1a5bf2 100644 --- a/TargetBridge-Sender/TBDisplayShared/TBMonitorProtocol.swift +++ b/TargetBridge-Sender/TBDisplayShared/TBMonitorProtocol.swift @@ -6,6 +6,7 @@ enum TBMonitorPacketType: UInt8 { case createSessionAck = 0x12 case paramSets = 0x20 case frame = 0x21 + case audioFrame = 0x23 case heartbeat = 0x30 case teardown = 0x31 case cursor = 0x32 diff --git a/docs/audio.md b/docs/audio.md new file mode 100644 index 0000000..615489b --- /dev/null +++ b/docs/audio.md @@ -0,0 +1,120 @@ +# Audio Streaming Architecture & Synchronization + +TargetBridge implements raw, high-fidelity system audio streaming from a sender Mac to a receiver Mac in both **Mirror Mode (Duplicate Desktop)** and **Extended Display Mode (Virtual Desktop)**. The stream is designed for ultra-low latency, real-time synchronization with H.264/HEVC video decoding, and robust scheduling jitter tolerance. + +This document describes the technical architecture, dynamic format conversion pipeline, and the synchronization breakthroughs that eliminated playout lag without sacrificing audio quality. + +--- + +## 🗺️ High-Level Pipeline + +```mermaid +flowchart LR + subgraph Sender (Swift) + A[ScreenCaptureKit] -->|Float32 Non-Interleaved| B[SBAudioConverter] + B -->|AVAudioConverter| C[S16 Interleaved PCM] + C -->|TCP Socket| D[NWConnection] + end + + subgraph Receiver (C) + D -->|TB_PKT_AUDIO_FRAME| E[TCP Parser] + E -->|Locked Resync Check| F[Circular Ring Buffer] + G[SDL Sound Card Thread] -->|audio_callback| F + end +``` + +--- + +## 🎙️ Sender-Side Architecture (Swift) + +### 1. Capture via ScreenCaptureKit +System audio is captured before the master hardware volume or mute is applied. This allows the user to manually mute their MacBook speakers while high-fidelity audio streams to the receiver. +* **`capturesAudio = true`**: Enables audio capture on the `SCStream`. +* **`excludesCurrentProcessAudio = true`**: Prevents the sender from capturing its own system sounds, avoiding feedback loops. +* **QoS Queue**: The capture stream delegates callbacks onto a high-priority `.userInteractive` dispatch queue (`fd.tbmonitor.sender.audio`). + +### 2. Format Conversion (`SBAudioConverter`) +ScreenCaptureKit outputs audio as **Float32 non-interleaved PCM** (separate buffers for left and right channels). +To make it compatible with low-overhead C playback systems (such as SDL2), the Swift sender converts it to standard **16-bit signed interleaved PCM at 48000Hz Stereo** (4 bytes per sample frame). + +The `SBAudioConverter` class executes this: +1. **Pointer Extraction**: Safely extracts the non-interleaved channel buffers using `CMSampleBufferGetAudioBufferListWithRetainedBlockBuffer`. +2. **Hardware-Accelerated Conversion**: Feeds the float pointers to an `AVAudioConverter` configured to transcode into a packed 16-bit signed integer interleaved `AudioStreamBasicDescription` (ASBD). +3. **Low-Allocation Copying**: Performs conversion frame-by-frame with zero persistent copies, preserving thread safety using Swift concurrency locks. + +### 3. Extended Display Mode Capture (Unified SCStream) +To support audio in Extended Display Mode, the capture strategy was unified: +* **The Legacy Approach**: Previously, Extended Display Mode captured the virtual display using the video-only `CGDisplayStream` API. However, `CGDisplayStream` has no audio capture capability. +* **The Modern Solution**: The virtual display capture was migrated to modern **ScreenCaptureKit (`SCStream`)**. + - Since the macOS WindowServer exposes the virtual extended desktop as a standard `SCDisplay` object inside `SCShareableContent.displays`, we can resolve and capture it using `SCStream`. + - By setting `capturesAudio = true` on the virtual display stream, ScreenCaptureKit captures system audio and delivers it alongside the virtual display's H.264 video frames. + - This unifies the entire sender-side pipeline, unlocking high-fidelity, low-latency audio for both Mirror and Extended Display sessions without needing separate capture loops. + +### 4. Per-Session Audio Toggles (Live Muting) +When broadcasting to multiple receivers, audio can be controlled on a per-session basis: +* **Decoupled State**: Each `TBDisplaySenderSession` manages its own `@Published var audioEnabled: Bool` state, initialized using the global preference as a default. +* **On-the-Fly Toggle**: Toggles inside each session card bind directly to that session's state and remain interactive at all times. +* **Instant Playout Cutoff**: The `processAudio` callback verifies `audioEnabled` before every frame conversion. Disabling the toggle stops packet transmission instantly, providing seamless real-time muting for individual targets during active streaming. + +--- + +## 🔊 Receiver-Side Architecture (C) + +The receiver utilizes the cross-platform **SDL2 Audio Subsystem** configured for raw PCM playback: +* **Audio Format**: `AUDIO_S16LSB` (16-bit signed little-endian PCM). +* **Sample Rate**: `48000 Hz`. +* **Channels**: `2` (Stereo). +* **Device Buffering**: Requested at **1024 samples (approx. 21.3ms)**. + +### The Evolution: Why `SDL_QueueAudio` Failed +Initially, the receiver used SDL2's queuing API (`SDL_QueueAudio`) and capped the backlog using `SDL_GetQueuedAudioSize() < 13440` (70ms). This failed due to three factors: +1. **OS-Level Hardware Buffering**: SDL2 immediately drains the external queued buffer into its internal OS/CoreAudio device playback ring buffers. Once the data leaves the SDL queue, `SDL_GetQueuedAudioSize` reports `0` for it, bypassing the backlog threshold and causing up to **1 second of hidden playback buffering**. +2. **Socket Congestion**: During temporary network slow-downs or high H.264 keyframe activity, audio packets accumulate in the TCP transmit/receive socket buffers (configured up to 4MB). When the network clears, the socket drains in a massive burst. Sequencing all these backlogged packets directly into playout caused a permanent, lagging delay. +3. **CPU Busy-Spinning & Thread Starvation**: Initially, the receiver's event loop checked non-blocking network socket events without yielding. This resulted in 100% CPU busy-spinning during active streaming, which created thread-scheduling contention and starved the real-time SDL audio thread. Starving this thread caused sporadic playout underflows and stuttering. Yielding for 1ms via `SDL_Delay(1)` in the main loop when the network socket is idle (0 bytes read) completely resolves this CPU starvation. + +--- + +## ⚡ The Synchronization Breakthroughs + +To resolve the delay without degrading audio quality, the pipeline was rewritten using a **circular ring buffer, a dedicated SDL callback, and a smooth-discard sliding-window resynchronization**. + +### 1. Dedicated Audio Callback (`audio_callback`) +Instead of pushing bytes, we configure SDL2 to pull bytes via an explicit callback: +* The sound card thread requests `len` bytes from the circular buffer. +* If the buffer does not have enough samples (underflow), it fills the remainder with silence (`memset(..., 0)`). This prevents the device from looping old samples, which would cause horrible static/buzzing. + +### 2. Circular Ring Buffer & Thread-Safe Locking +A 1-second circular buffer (`audio_buf`) is added to the receiver's main `app` context: +* The callback reads from the buffer (updating `audio_buf_tail`). +* The TCP socket thread writes incoming network frames to the buffer (updating `audio_buf_head`). +* Since the callback runs on an independent SDL system thread, any modifications to the buffer indexes on the main TCP socket thread are wrapped inside **`SDL_LockAudioDevice`** and **`SDL_UnlockAudioDevice`** to prevent data races. + +### 3. Smooth-Discard (Sliding-Window Resync) +Rather than aggressively clearing/wiping the entire audio buffer when it gets backlogged (which causes silent gaps, sudden dropouts, and loud popping noises), we implement a **smooth-discard sliding window**: + +* We set a strict maximum latency ceiling of **150ms** (equivalent to `150 * 192 = 28800` bytes). +* In `on_packet`'s `TB_PKT_AUDIO_FRAME` handler, we check the total queued size: + ```c + const int cap_bytes = 28800; // 150ms + if (a->audio_buf_size + len > cap_bytes) { + int excess = (a->audio_buf_size + len) - cap_bytes; + a->audio_buf_tail = (a->audio_buf_tail + excess) % AUDIO_BUF_CAP; + a->audio_buf_size -= excess; + } + ``` +* **How it works**: If a burst of socket-backlogged packets arrives, the check immediately triggers. Instead of deleting all data, it **advances the read tail pointer by the exact excess byte count**. +* **The Result**: The oldest, lagging samples are skipped instantly. The circular buffer is left holding exactly **150ms of the newest, most up-to-date audio samples**. +* **Acoustics**: Truncating just the oldest samples in this manner is perceived by the ear as a seamless micro-skip, maintaining crystal-clear playout fidelity, while guaranteeing that audio latency stays perfectly locked to the video stream. + +--- + +## 🛠️ Diagnostics & Tweaking + +Developers can tweak the following properties in `main.c` depending on hardware limits: + +1. **`spec.samples` (Hardware Buffer Size)**: + - Configured at `1024` samples. If run on modern Apple Silicon, this can be safely reduced to `512` (10.6ms) or `256` (5.3ms) for even lower latency. + - For older Intel Macs or high CPU scheduling jitter, keep this at `1024` to prevent scheduling underflows (which cause crackling/static). +2. **`cap_bytes` (Latency Threshold)**: + - Configured at `28800` bytes (150ms) to cushion against ScreenCaptureKit variable delivery chunks and socket congestion. + - If H.264 video decoding takes longer on a specific system, this can be adjusted to match video latency.