diff --git a/TargetBridge-Receiver/TBReceiverC/src/display.c b/TargetBridge-Receiver/TBReceiverC/src/display.c
index 5e11238..d753946 100644
--- a/TargetBridge-Receiver/TBReceiverC/src/display.c
+++ b/TargetBridge-Receiver/TBReceiverC/src/display.c
@@ -241,7 +241,7 @@ struct tb_display *tb_disp_create(int fullscreen) {
      * anisotropic where supported. Must be set BEFORE renderer creation. */
     SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "best");
 
-    if (SDL_Init(SDL_INIT_VIDEO) < 0) {
+    if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_AUDIO) < 0) {
         fprintf(stderr, "[disp] SDL_Init: %s\n", SDL_GetError());
         return NULL;
     }
diff --git a/TargetBridge-Receiver/TBReceiverC/src/main.c b/TargetBridge-Receiver/TBReceiverC/src/main.c
index bc09439..52ddfc9 100644
--- a/TargetBridge-Receiver/TBReceiverC/src/main.c
+++ b/TargetBridge-Receiver/TBReceiverC/src/main.c
@@ -30,6 +30,8 @@
 #include <time.h>
 #include <unistd.h>
 
+#define AUDIO_BUF_CAP (192000) // 1 second buffer of 48000Hz stereo 16-bit PCM
+
 struct app {
     struct tb_display *disp;
     struct tb_decoder *dec;
@@ -53,6 +55,13 @@ struct app {
 
     DNSServiceRef bonjour_ref;
     char     bonjour_name[128];
+
+    SDL_AudioDeviceID audio_device;
+
+    uint8_t audio_buf[AUDIO_BUF_CAP];
+    int     audio_buf_head;
+    int     audio_buf_tail;
+    int     audio_buf_size;
 };
 
 static volatile sig_atomic_t g_term = 0;
@@ -221,6 +230,29 @@ static void on_frame(const uint8_t *y, int y_stride,
     a->frames++;
 }
 
+/* Dequeue `len` bytes from the audio ring into dst; caller must ensure len <= audio_buf_size. */
+static void ring_read(struct app *a, Uint8 *dst, int len) {
+    const int tail = a->audio_buf_tail;
+    const int until_wrap = AUDIO_BUF_CAP - tail;
+    const int head_part = len <= until_wrap ? len : until_wrap;
+    memcpy(dst, a->audio_buf + tail, head_part);
+    if (head_part < len)
+        memcpy(dst + head_part, a->audio_buf, len - head_part);
+    a->audio_buf_tail = (tail + len) % AUDIO_BUF_CAP;
+    a->audio_buf_size -= len;
+}
+
+/* SDL audio callback: fill `stream` from the ring and zero-pad whatever is missing. */
+static void audio_callback(void *userdata, Uint8 *stream, int len) {
+    struct app *a = userdata;
+    /* Never take more than what is currently queued. */
+    const int take = a->audio_buf_size < len ? a->audio_buf_size : len;
+    if (take > 0)
+        ring_read(a, stream, take);
+    if (take < len)
+        memset(stream + take, 0, len - take);
+}
+
 /* ---- Callbacks: parser → decoder ------------------------------------- */
 
 static void on_packet(uint8_t type, const uint8_t *payload, size_t len, void *ud) {
@@ -290,6 +322,35 @@ static void on_packet(uint8_t type, const uint8_t *payload, size_t len, void *ud
             tb_disp_set_cursor(a->disp, x, y, w, h, visible, type);
         }
         break;
+    case TB_PKT_AUDIO_FRAME:
+        /* Queue PCM for audio_callback. The ring is shared with that callback,
+         * so it is only touched while the SDL audio device lock is held. */
+        if (a->audio_device != 0 && len > 0) {
+            /* Backlog cap: 150 ms at 192 bytes/ms (48 kHz stereo 16-bit). */
+            const int cap_bytes = 28800;
+            /* Keep only the newest cap_bytes of an oversized packet, so len is
+             * range-checked before narrowing and no copy can overrun audio_buf. */
+            const int n = len > (size_t)cap_bytes ? cap_bytes : (int)len;
+            const uint8_t *src = payload + (len - (size_t)n);
+            SDL_LockAudioDevice(a->audio_device);
+            /* Discard the oldest queued bytes that would push us past the cap. */
+            int excess = a->audio_buf_size + n - cap_bytes;
+            if (excess > 0) {
+                a->audio_buf_tail = (a->audio_buf_tail + excess) % AUDIO_BUF_CAP;
+                a->audio_buf_size -= excess;
+            }
+            /* Append, splitting the copy where the ring wraps. */
+            int first = AUDIO_BUF_CAP - a->audio_buf_head;
+            if (first > n) {
+                first = n;
+            }
+            memcpy(a->audio_buf + a->audio_buf_head, src, (size_t)first);
+            memcpy(a->audio_buf, src + first, (size_t)(n - first));
+            a->audio_buf_head = (a->audio_buf_head + n) % AUDIO_BUF_CAP;
+            a->audio_buf_size += n;
+            SDL_UnlockAudioDevice(a->audio_device);
+        }
+        break;
     case TB_PKT_HEARTBEAT:
         break;
     case TB_PKT_TEST_DATA:
@@ -308,11 +369,13 @@ static void on_packet(uint8_t type, const uint8_t *payload, size_t len, void *ud
 
 /* ---- Networking helpers ---------------------------------------------- */
 
-static int drain_socket(struct app *a) {
+static int drain_socket(struct app *a, int *bytes_read) {
     uint8_t buf[1024 * 1024];
+    if (bytes_read) *bytes_read = 0;
     for (;;) {
         ssize_t n = read(a->client_fd, buf, sizeof(buf));
         if (n > 0) {
+            if (bytes_read) *bytes_read += n;
             if (tb_parser_feed(&a->parser, buf, (size_t)n) < 0) return -1;
         } else if (n == 0) {
             return -1;  /* peer closed */
@@ -422,6 +485,14 @@ static void close_client(struct app *a) {
     tb_parser_free(&a->parser);
     tb_parser_init(&a->parser, on_packet, a);
     tb_dec_reset(a->dec);   /* fresh decoder for next session */
+    if (a->audio_device != 0) {
+        SDL_LockAudioDevice(a->audio_device);
+        a->audio_buf_head = 0;
+        a->audio_buf_tail = 0;
+        a->audio_buf_size = 0;
+        SDL_UnlockAudioDevice(a->audio_device);
+    }
+
     fprintf(stderr, "[main] client disconnected\n");
 }
 
@@ -464,6 +535,24 @@ int main(int argc, char **argv) {
     a.disp = tb_disp_create(fullscreen);
     if (!a.disp) { fprintf(stderr, "tb_disp_create failed\n"); return 1; }
 
+    /* Open SDL Audio Device */
+    SDL_AudioSpec spec;
+    SDL_zero(spec);
+    spec.freq = 48000;
+    spec.format = AUDIO_S16LSB; // 16-bit signed, little-endian PCM
+    spec.channels = 2;          // Stereo
+    spec.samples = 1024;        // Buffer size (approx 21.3ms)
+    spec.callback = audio_callback;
+    spec.userdata = &a;
+    SDL_AudioSpec obtained;
+    a.audio_device = SDL_OpenAudioDevice(NULL, 0, &spec, &obtained, 0);
+    if (a.audio_device != 0) {
+        SDL_PauseAudioDevice(a.audio_device, 0); // Start playing (unpaused)
+        fprintf(stderr, "[main] SDL audio device opened: 48000Hz stereo 16-bit PCM (obtained %d samples)\n", obtained.samples);
+    } else {
+        fprintf(stderr, "[main] warning: SDL_OpenAudioDevice failed: %s\n", SDL_GetError());
+    }
+
     struct tb_display_info boot_info;
     if (tb_disp_get_info(a.disp, &boot_info) == 0) {
         snprintf(a.panel_text, sizeof(a.panel_text), "%u x %u px (%s)",
@@ -486,6 +575,7 @@ int main(int argc, char **argv) {
 
     while (!g_term && !tb_disp_poll_quit(a.disp)) {
         uint64_t t = now_ms();
+        int bytes_read = 0;
 
         if (t - a.last_ip_check_ms >= 1000) {
             char refreshed_ip[64] = {0};
@@ -511,7 +601,7 @@ int main(int argc, char **argv) {
                 send_receiver_info(&a);
             }
         } else {
-            if (drain_socket(&a) < 0) close_client(&a);
+            if (drain_socket(&a, &bytes_read) < 0) close_client(&a);
             else if (a.close_requested) close_client(&a);
         }
 
@@ -528,8 +618,9 @@ int main(int argc, char **argv) {
         }
 
         /* Yield only while idle. During active video, keep draining and
-         * rendering without injecting an extra millisecond of latency. */
-        if (a.client_fd < 0 || !a.have_video_frame) {
+         * rendering without injecting an extra millisecond of latency.
+         * If we didn't read any bytes from the socket, we can safely yield 1ms. */
+        if (a.client_fd < 0 || !a.have_video_frame || bytes_read == 0) {
             SDL_Delay(1);
         }
     }
@@ -539,6 +630,9 @@ int main(int argc, char **argv) {
     bonjour_deinit(&a);
     tb_parser_free(&a.parser);
     tb_dec_destroy(a.dec);
+    if (a.audio_device != 0) {
+        SDL_CloseAudioDevice(a.audio_device);
+    }
     tb_disp_destroy(a.disp);
     fprintf(stderr, "[main] bye\n");
     return 0;
diff --git a/TargetBridge-Receiver/TBReceiverC/src/proto.h b/TargetBridge-Receiver/TBReceiverC/src/proto.h
index f2d4124..75c853c 100644
--- a/TargetBridge-Receiver/TBReceiverC/src/proto.h
+++ b/TargetBridge-Receiver/TBReceiverC/src/proto.h
@@ -30,6 +30,7 @@
 #define TB_PKT_CREATE_SESSION_ACK 0x12
 #define TB_PKT_PARAM_SETS       0x20
 #define TB_PKT_FRAME            0x21
+#define TB_PKT_AUDIO_FRAME      0x23
 #define TB_PKT_HEARTBEAT        0x30
 #define TB_PKT_TEARDOWN         0x31
 #define TB_PKT_CURSOR           0x32
diff --git a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderContentView.swift b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderContentView.swift
index 39d0702..d44f88f 100644
--- a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderContentView.swift
+++ b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderContentView.swift
@@ -163,6 +163,9 @@ struct TBDisplaySenderContentView: View {
 
                 Toggle(TBDisplaySenderL10n.largeCursor(service.language), isOn: $service.largeCursor)
                     .disabled(service.anyConnected)
+
+                Toggle(TBDisplaySenderL10n.streamAudio(service.language), isOn: $service.audioEnabled)
+                    .disabled(service.anyConnected)
             }
         }
     }
@@ -266,6 +269,11 @@ private struct TBDisplaySenderSessionCard: View {
                         .disabled(session.isConnected || session.isStreaming)
                     }
 
+                    controlRow(TBDisplaySenderL10n.streamAudio(service.language)) {
+                        Toggle("", isOn: $session.audioEnabled)
+                            .labelsHidden()
+                    }
+
                     VStack(alignment: .leading, spacing: 4) {
                         Text(TBDisplaySenderL10n.streamHint1(service.language))
                         Text(TBDisplaySenderL10n.streamHint2(service.language))
diff --git a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderLocalization.swift b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderLocalization.swift
index 18fa916..fa92e90 100644
--- a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderLocalization.swift
+++ b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderLocalization.swift
@@ -480,6 +480,14 @@ enum TBDisplaySenderL10n {
         }
     }
 
+    static func streamAudio(_ language: TBDisplaySenderLanguage) -> String { // Label for the audio-streaming toggles, in the given UI language.
+        switch language {
+        case .italian: return "Trasmetti audio del Mac"
+        case .english: return "Stream Mac audio"
+        case .german: return "Mac-Audio übertragen"
+        }
+    }
+
     static func showMainWindow(_ language: TBDisplaySenderLanguage) -> String {
         switch language {
         case .italian: return "Mostra finestra principale"
diff --git a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderManager.swift b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderManager.swift
index 929d4c2..72177f0 100644
--- a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderManager.swift
+++ b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderManager.swift
@@ -31,6 +31,12 @@ final class TBDisplaySenderService: ObservableObject {
             objectWillChange.send()
         }
     }
+    @Published var audioEnabled: Bool = UserDefaults.standard.object(forKey: "fd.tbdisplaysender.audioEnabled") as? Bool ?? true {
+        didSet { // Initial value falls back to true when nothing is stored; every change is persisted under the same key.
+            UserDefaults.standard.set(audioEnabled, forKey: "fd.tbdisplaysender.audioEnabled")
+            objectWillChange.send()
+        }
+    }
 
     private var sessionCancellables: [UUID: AnyCancellable] = [:]
     private let receiverDiscovery = TBReceiverDiscovery()
@@ -70,7 +76,7 @@ final class TBDisplaySenderService: ObservableObject {
     }
 
     func addSession() {
-        let session = TBDisplaySenderSession(language: language, largeCursor: largeCursor)
+        let session = TBDisplaySenderSession(language: language, largeCursor: largeCursor, audioEnabled: audioEnabled)
         if let previous = sessions.last {
             session.capturePreset = previous.capturePreset
             session.captureSource = previous.captureSource
diff --git a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderService.swift b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderService.swift
index 5238797..3cad571 100644
--- a/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderService.swift
+++ b/TargetBridge-Sender/TBDisplaySender/TBDisplaySenderService.swift
@@ -4,6 +4,7 @@ import CoreMedia
 import CoreVideo
 import Darwin
 import Foundation
+import AVFoundation
 import IOSurface
 import Network
 @preconcurrency import ScreenCaptureKit
@@ -111,7 +112,7 @@ enum TBDisplayCapturePreset: String, CaseIterable, Identifiable {
         if let envVal = ProcessInfo.processInfo.environment["QD"], let parsed = Int(envVal) {
             return parsed
         }
-        return 2
+        return 6
     }
 
     var expectedFrameRate: Int {
@@ -170,7 +171,7 @@ enum TBDisplayCapturePreset: String, CaseIterable, Identifiable {
         if let envVal = ProcessInfo.processInfo.environment["MPVP"], let parsed = Int(envVal) {
             return parsed
         }
-        return 3
+        return 10
     }
 
     var maxFrameDelayCount: Int {
@@ -195,7 +196,7 @@ enum TBDisplayCapturePreset: String, CaseIterable, Identifiable {
         if let envVal = ProcessInfo.processInfo.environment["MIFEF"], let parsed = Int(envVal) {
             return parsed
         }
-        return 5
+        return 6
     }
 
     var captureResolution: SCCaptureResolutionType {
@@ -306,6 +307,7 @@ private final class TBDirectDisplayStreamCapture {
 @MainActor
 final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @unchecked Sendable {
     private static let receiverIPDefaultsKey = "fd.tbdisplaysender.receiverIP"
+    private static let audioEnabledDefaultsKey = "fd.tbdisplaysender.audioEnabled"
     private struct SavedExtendedDisplayArrangement {
         let x: Int32
         let y: Int32
@@ -394,12 +396,13 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
 
     let id = UUID()
 
-    init(language: TBDisplaySenderLanguage, largeCursor: Bool) {
+    init(language: TBDisplaySenderLanguage, largeCursor: Bool, audioEnabled: Bool) {
         self.statusText = TBDisplaySenderStatusState.ready.text(language)
         self.receiverPanelText = TBDisplaySenderL10n.waitingReceiverProfile(language)
         self.virtualDisplayText = TBDisplaySenderL10n.virtualDisplayNotCreated(language)
         self.language = language
         self.largeCursor = largeCursor
+        self.audioEnabled = audioEnabled
         self.streamResolutionText = TBDisplaySenderL10n.streamSummary(
             preset: .standard1440p,
             source: .desktopMirror,
@@ -421,6 +424,7 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
             UserDefaults.standard.set(receiverIP, forKey: Self.receiverIPDefaultsKey)
         }
     }
+    @Published var audioEnabled: Bool
     @Published var senderFPS = 0
     @Published var receiverPanelText: String
     @Published var virtualDisplayText: String
@@ -453,6 +457,7 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
     private var recvBuffer = Data()
 
     private var session = ReceiverBackedVirtualDisplaySession()
+    private let audioConverter = SBAudioConverter()
     private var activeProfile: TBMonitorDisplayProfile?
 
     private var captureDelegate: CaptureDelegate?
@@ -480,6 +485,7 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
 
     private final class CaptureDelegate: NSObject, SCStreamOutput, SCStreamDelegate {
         var onFrame: ((CMSampleBuffer) -> Void)?
+        var onAudio: ((CMSampleBuffer) -> Void)?
         var onError: ((Error) -> Void)?
 
         private static func shouldProcessFrame(_ sampleBuffer: CMSampleBuffer) -> Bool {
@@ -504,6 +510,10 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
         nonisolated func stream(_ stream: SCStream,
                                 didOutputSampleBuffer sampleBuffer: CMSampleBuffer,
                                 of type: SCStreamOutputType) {
+            if type == .audio {
+                onAudio?(sampleBuffer)
+                return
+            }
             guard type == .screen else { return }
             guard Self.shouldProcessFrame(sampleBuffer) else { return }
             onFrame?(sampleBuffer)
@@ -1000,15 +1010,25 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
         do {
             let preset = capturePreset
 
-            if captureSource == .extendedDesktop, session.displayID != kCGNullDirectDisplay {
-                if startDirectDisplayStream(displayID: session.displayID, preset: preset) {
-                    return true
+            let display: SCDisplay
+            if captureSource == .desktopMirror {
+                // In mirror mode, the virtual display mirrors the main display.
+                // We capture the main display directly via ScreenCaptureKit to stream both video and system audio.
+                let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
+                guard let mainDisplay = content.displays.first(where: { $0.displayID == CGMainDisplayID() }) else {
+                    return false
+                }
+                display = mainDisplay
+            } else {
+                // In extended desktop mode, we capture the virtual display using ScreenCaptureKit
+                // to support both video and high-fidelity system audio.
+                let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
+                if session.displayID != kCGNullDirectDisplay,
+                   let targetDisplay = content.displays.first(where: { $0.displayID == session.displayID }) {
+                    display = targetDisplay
+                } else {
+                    display = try await waitForCaptureDisplay()
                 }
-            }
-
-            let display = try await waitForCaptureDisplay()
-            if startDirectDisplayStream(displayID: display.displayID, preset: preset) {
-                return true
             }
 
             let configuration = SCStreamConfiguration()
@@ -1020,6 +1040,10 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
             configuration.showsCursor = !largeCursor
             configuration.scalesToFit = true
             configuration.captureResolution = preset.captureResolution
+            configuration.capturesAudio = true
+            configuration.excludesCurrentProcessAudio = true
+            configuration.sampleRate = 48000
+            configuration.channelCount = 2
 
             setupEncoder(
                 width: preset.width,
@@ -1034,6 +1058,9 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
             delegate.onFrame = { [weak self] sampleBuffer in
                 self?.encode(sampleBuffer)
             }
+            delegate.onAudio = { [weak self] sampleBuffer in
+                self?.processAudio(sampleBuffer)
+            }
             delegate.onError = { [weak self] error in
                 Task { @MainActor [weak self] in
                     guard let self else { return }
@@ -1051,6 +1078,11 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
                 type: .screen,
                 sampleHandlerQueue: DispatchQueue(label: "fd.tbmonitor.sender.capture", qos: .userInteractive)
             )
+            try stream.addStreamOutput(
+                delegate,
+                type: .audio,
+                sampleHandlerQueue: DispatchQueue(label: "fd.tbmonitor.sender.audio", qos: .userInteractive)
+            )
             try await stream.startCapture()
             scStream = stream
             isStreaming = true
@@ -1683,8 +1715,137 @@ final class TBDisplaySenderSession: NSObject, ObservableObject, Identifiable, @u
         }
     }
 
+    private func processAudio(_ sampleBuffer: CMSampleBuffer) { // Converts one captured audio buffer and sends it as an .audioFrame packet.
+        guard audioEnabled else { return } // Per-session toggle. NOTE(review): appears to run on the audio sample-handler queue while this type is @MainActor — confirm this read is race-free.
+        guard let data = audioConverter.convert(sampleBuffer: sampleBuffer) else { return } // nil means there is nothing to send for this buffer
+        let packet = TBMonitorProtocol.makePacket(type: .audioFrame, payload: data)
+        send(packet)
+    }
+
     private func send(_ packet: Data) {
         connection?.send(content: packet, completion: .contentProcessed({ _ in }))
     }
 
 }
+
+private final class SBAudioConverter: Sendable {
+    private let converterState: LockedConverterState = LockedConverterState()
+
+    /// Converter state guarded by `lock`: the mutable properties are only touched
+    /// inside `convert(sampleBuffer:)`, which holds the lock for its whole body.
+    private final class LockedConverterState: @unchecked Sendable {
+        private let lock = NSLock()
+        var converter: AVAudioConverter?
+        var inputFormat: AVAudioFormat?
+        let outputFormat: AVAudioFormat
+
+        init() {
+            // Output format: 48 kHz stereo, packed signed 16-bit, 4 bytes per frame.
+            var asbd = AudioStreamBasicDescription(
+                mSampleRate: 48000.0,
+                mFormatID: kAudioFormatLinearPCM,
+                mFormatFlags: kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked,
+                mBytesPerPacket: 4,
+                mFramesPerPacket: 1,
+                mBytesPerFrame: 4,
+                mChannelsPerFrame: 2,
+                mBitsPerChannel: 16,
+                mReserved: 0
+            )
+            self.outputFormat = AVAudioFormat(streamDescription: &asbd)!
+        }
+
+        /// Converts one captured audio sample buffer to `outputFormat` bytes.
+        /// Returns `nil` for empty or unsupported input and when conversion fails.
+        func convert(sampleBuffer: CMSampleBuffer) -> Data? {
+            lock.lock()
+            defer { lock.unlock() }
+
+            guard let formatDesc = CMSampleBufferGetFormatDescription(sampleBuffer),
+                  let inputASBD = CMAudioFormatDescriptionGetStreamBasicDescription(formatDesc)?.pointee
+            else { return nil }
+
+            // Rebuild the converter when any layout-relevant input field changes.
+            let current = inputFormat?.streamDescription.pointee
+            if current?.mFormatID != inputASBD.mFormatID ||
+               current?.mFormatFlags != inputASBD.mFormatFlags ||
+               current?.mSampleRate != inputASBD.mSampleRate ||
+               current?.mChannelsPerFrame != inputASBD.mChannelsPerFrame ||
+               current?.mBitsPerChannel != inputASBD.mBitsPerChannel {
+                var mutableASBD = inputASBD
+                guard let inFormat = AVAudioFormat(streamDescription: &mutableASBD) else { return nil }
+                self.inputFormat = inFormat
+                self.converter = AVAudioConverter(from: inFormat, to: outputFormat)
+            }
+            guard let converter = self.converter, let inFormat = self.inputFormat else { return nil }
+
+            let frameCount = sampleBuffer.numSamples
+            guard frameCount > 0 else { return nil }
+            let audioFrameCount = AVAudioFrameCount(frameCount)
+
+            // `floatChannelData` is nil for non-Float32 input; drop it rather than convert an unfilled buffer.
+            guard let inputBuffer = AVAudioPCMBuffer(pcmFormat: inFormat, frameCapacity: audioFrameCount),
+                  let channelData = inputBuffer.floatChannelData
+            else { return nil }
+            inputBuffer.frameLength = audioFrameCount
+
+            // Room for one AudioBuffer per channel (non-interleaved layout).
+            let channelCount = Int(inFormat.channelCount)
+            guard channelCount > 0 else { return nil }
+            let bufferListSize = MemoryLayout<AudioBufferList>.size + (channelCount - 1) * MemoryLayout<AudioBuffer>.size
+            let bufferListRaw = UnsafeMutableRawPointer.allocate(byteCount: bufferListSize, alignment: MemoryLayout<AudioBufferList>.alignment)
+            defer { bufferListRaw.deallocate() }
+            let ablPointer = bufferListRaw.assumingMemoryBound(to: AudioBufferList.self)
+            var blockBuffer: CMBlockBuffer?
+            let status = CMSampleBufferGetAudioBufferListWithRetainedBlockBuffer(
+                sampleBuffer,
+                bufferListSizeNeededOut: nil,
+                bufferListOut: ablPointer,
+                bufferListSize: bufferListSize,
+                blockBufferAllocator: nil,
+                blockBufferMemoryAllocator: nil,
+                flags: kCMSampleBufferFlag_AudioBufferList_Assure16ByteAlignment,
+                blockBufferOut: &blockBuffer
+            )
+            guard status == noErr else { return nil }
+            guard !inFormat.isInterleaved else {
+                // Non-interleaved input is expected from ScreenCaptureKit; flag anything else in debug builds.
+                assertionFailure("SBAudioConverter: unexpected interleaved input format from ScreenCaptureKit")
+                return nil
+            }
+
+            // Require one source buffer per channel and clamp each copy to the destination's capacity.
+            let buffers = UnsafeMutableAudioBufferListPointer(ablPointer)
+            guard buffers.count == channelCount else { return nil }
+            let channelCapacity = Int(audioFrameCount) * MemoryLayout<Float>.size
+            withExtendedLifetime(blockBuffer) { // keeps the retained block buffer alive during the copy
+                for (i, buffer) in buffers.enumerated() {
+                    guard let src = buffer.mData else { continue }
+                    memcpy(channelData[i], src, min(Int(buffer.mDataByteSize), channelCapacity))
+                }
+            }
+
+            guard let outputBuffer = AVAudioPCMBuffer(pcmFormat: outputFormat, frameCapacity: audioFrameCount) else { return nil }
+            // Supply the single input buffer once, then report that no more data is available.
+            var error: NSError?
+            var inputConsumed = false
+            let convertStatus = converter.convert(to: outputBuffer, error: &error) { _, outStatus in
+                if inputConsumed {
+                    outStatus.pointee = .noDataNow
+                    return nil
+                }
+                inputConsumed = true
+                outStatus.pointee = .haveData
+                return inputBuffer
+            }
+            guard convertStatus != .error, error == nil else { return nil }
+            guard let channels = outputBuffer.int16ChannelData else { return nil }
+            let bytesPerFrame = Int(outputFormat.streamDescription.pointee.mBytesPerFrame)
+            return Data(bytes: UnsafeRawPointer(channels.pointee), count: Int(outputBuffer.frameLength) * bytesPerFrame)
+        }
+    }
+
+    func convert(sampleBuffer: CMSampleBuffer) -> Data? {
+        return converterState.convert(sampleBuffer: sampleBuffer)
+    }
+}
diff --git a/TargetBridge-Sender/TBDisplayShared/TBMonitorProtocol.swift b/TargetBridge-Sender/TBDisplayShared/TBMonitorProtocol.swift
index ec93814..d1a5bf2 100644
--- a/TargetBridge-Sender/TBDisplayShared/TBMonitorProtocol.swift
+++ b/TargetBridge-Sender/TBDisplayShared/TBMonitorProtocol.swift
@@ -6,6 +6,7 @@ enum TBMonitorPacketType: UInt8 {
     case createSessionAck = 0x12
     case paramSets = 0x20
     case frame = 0x21
+    case audioFrame = 0x23
     case heartbeat = 0x30
     case teardown = 0x31
     case cursor = 0x32
diff --git a/docs/audio.md b/docs/audio.md
new file mode 100644
index 0000000..615489b
--- /dev/null
+++ b/docs/audio.md
@@ -0,0 +1,120 @@
+# Audio Streaming Architecture & Synchronization
+
+TargetBridge implements raw, high-fidelity system audio streaming from a sender Mac to a receiver Mac in both **Mirror Mode (Duplicate Desktop)** and **Extended Display Mode (Virtual Desktop)**. The stream is designed for ultra-low latency, real-time synchronization with H.264/HEVC video decoding, and robust scheduling jitter tolerance.
+
+This document describes the technical architecture, dynamic format conversion pipeline, and the synchronization breakthroughs that eliminated playout lag without sacrificing audio quality.
+
+---
+
+## 🗺️ High-Level Pipeline
+
+```mermaid
+flowchart LR
+    subgraph Sender["Sender (Swift)"]
+        A[ScreenCaptureKit] -->|Float32 Non-Interleaved| B[SBAudioConverter]
+        B -->|AVAudioConverter| C[S16 Interleaved PCM]
+        C -->|TCP Socket| D[NWConnection]
+    end
+
+    subgraph Receiver["Receiver (C)"]
+        D -->|TB_PKT_AUDIO_FRAME| E[TCP Parser]
+        E -->|Locked Resync Check| F[Circular Ring Buffer]
+        G[SDL Sound Card Thread] -->|audio_callback| F
+    end
+```
+
+---
+
+## 🎙️ Sender-Side Architecture (Swift)
+
+### 1. Capture via ScreenCaptureKit
+System audio is captured before the master hardware volume or mute is applied. This allows the user to manually mute their MacBook speakers while high-fidelity audio streams to the receiver.
+* **`capturesAudio = true`**: Enables audio capture on the `SCStream`.
+* **`excludesCurrentProcessAudio = true`**: Prevents the sender from capturing its own system sounds, avoiding feedback loops.
+* **QoS Queue**: The capture stream delivers its callbacks on a high-priority `.userInteractive` dispatch queue (`fd.tbmonitor.sender.audio`).
+
+### 2. Format Conversion (`SBAudioConverter`)
+ScreenCaptureKit outputs audio as **Float32 non-interleaved PCM** (separate buffers for left and right channels).
+To make it compatible with low-overhead C playback systems (such as SDL2), the Swift sender converts it to standard **16-bit signed interleaved stereo PCM at 48000 Hz** (4 bytes per sample frame).
+
+The `SBAudioConverter` class executes this:
+1. **Pointer Extraction**: Safely extracts the non-interleaved channel buffers using `CMSampleBufferGetAudioBufferListWithRetainedBlockBuffer`.
+2. **Hardware-Accelerated Conversion**: Feeds the float pointers to an `AVAudioConverter` configured to transcode into a packed 16-bit signed integer interleaved `AudioStreamBasicDescription` (ASBD).
+3. **Low-Allocation Copying**: Performs conversion frame-by-frame with zero persistent copies, preserving thread safety using Swift concurrency locks.
+
+### 3. Extended Display Mode Capture (Unified SCStream)
+To support audio in Extended Display Mode, the capture strategy was unified:
+* **The Legacy Approach**: Previously, Extended Display Mode captured the virtual display using the video-only `CGDisplayStream` API. However, `CGDisplayStream` has no audio capture capability.
+* **The Modern Solution**: The virtual display capture was migrated to modern **ScreenCaptureKit (`SCStream`)**.
+  - Since the macOS WindowServer exposes the virtual extended desktop as a standard `SCDisplay` object inside `SCShareableContent.displays`, we can resolve and capture it using `SCStream`.
+  - With `capturesAudio = true` set on the virtual display stream, ScreenCaptureKit captures system audio and delivers it alongside the virtual display's captured video frames (which are then H.264-encoded for transmission).
+  - This unifies the entire sender-side pipeline, unlocking high-fidelity, low-latency audio for both Mirror and Extended Display sessions without needing separate capture loops.
+
+### 4. Per-Session Audio Toggles (Live Muting)
+When broadcasting to multiple receivers, audio can be controlled on a per-session basis:
+* **Decoupled State**: Each `TBDisplaySenderSession` manages its own `@Published var audioEnabled: Bool` state, initialized using the global preference as a default.
+* **On-the-Fly Toggle**: Toggles inside each session card bind directly to that session's state and remain interactive at all times.
+* **Instant Playout Cutoff**: The `processAudio` callback verifies `audioEnabled` before every frame conversion. Disabling the toggle stops packet transmission instantly, providing seamless real-time muting for individual targets during active streaming.
+
+---
+
+## 🔊 Receiver-Side Architecture (C)
+
+The receiver utilizes the cross-platform **SDL2 Audio Subsystem** configured for raw PCM playback:
+* **Audio Format**: `AUDIO_S16LSB` (16-bit signed little-endian PCM).
+* **Sample Rate**: `48000 Hz`.
+* **Channels**: `2` (Stereo).
+* **Device Buffering**: Requested at **1024 samples (approx. 21.3ms)**.
+
+### The Evolution: Why `SDL_QueueAudio` Failed
+Initially, the receiver used SDL2's queuing API (`SDL_QueueAudio`) and capped the backlog using `SDL_GetQueuedAudioSize() < 13440` (70ms). This failed due to three factors:
+1. **OS-Level Hardware Buffering**: SDL2 immediately drains the external queued buffer into its internal OS/CoreAudio device playback ring buffers. Once the data leaves the SDL queue, `SDL_GetQueuedAudioSize` reports `0` for it, bypassing the backlog threshold and causing up to **1 second of hidden playback buffering**.
+2. **Socket Congestion**: During temporary network slow-downs or high H.264 keyframe activity, audio packets accumulate in the TCP transmit/receive socket buffers (configured up to 4MB). When the network clears, the socket drains in a massive burst. Sequencing all these backlogged packets directly into playout caused a permanent, lagging delay.
+3. **CPU Busy-Spinning & Thread Starvation**: Initially, the receiver's event loop checked non-blocking network socket events without yielding. This resulted in 100% CPU busy-spinning during active streaming, which created thread-scheduling contention and starved the real-time SDL audio thread. Starving this thread caused sporadic playout underflows and stuttering. Yielding for 1ms via `SDL_Delay(1)` in the main loop when the network socket is idle (0 bytes read) completely resolves this CPU starvation.
+
+---
+
+## ⚡ The Synchronization Breakthroughs
+
+To resolve the delay without degrading audio quality, the pipeline was rewritten using a **circular ring buffer, a dedicated SDL callback, and a smooth-discard sliding-window resynchronization**.
+
+### 1. Dedicated Audio Callback (`audio_callback`)
+Instead of pushing bytes, we configure SDL2 to pull bytes via an explicit callback:
+* The sound card thread requests `len` bytes from the circular buffer.
+* If the buffer does not have enough samples (underflow), it fills the remainder with silence (`memset(..., 0)`). This prevents the device from looping old samples, which would cause horrible static/buzzing.
+
+### 2. Circular Ring Buffer & Thread-Safe Locking
+A 1-second circular buffer (`audio_buf`) is added to the receiver's main `app` context:
+* The callback reads from the buffer (updating `audio_buf_tail`).
+* The TCP socket thread writes incoming network frames to the buffer (updating `audio_buf_head`).
+* Since the callback runs on an independent SDL system thread, any modifications to the buffer indexes on the main TCP socket thread are wrapped inside **`SDL_LockAudioDevice`** and **`SDL_UnlockAudioDevice`** to prevent data races.
+
+### 3. Smooth-Discard (Sliding-Window Resync)
+Rather than aggressively clearing/wiping the entire audio buffer when it gets backlogged (which causes silent gaps, sudden dropouts, and loud popping noises), we implement a **smooth-discard sliding window**:
+
+* We set a strict maximum latency ceiling of **150ms** (equivalent to `150 * 192 = 28800` bytes).
+* In `on_packet`'s `TB_PKT_AUDIO_FRAME` handler, we check the total queued size:
+  ```c
+  const int cap_bytes = 28800; // 150ms
+  if (a->audio_buf_size + len > cap_bytes) {
+      int excess = (a->audio_buf_size + len) - cap_bytes;
+      a->audio_buf_tail = (a->audio_buf_tail + excess) % AUDIO_BUF_CAP;
+      a->audio_buf_size -= excess;
+  }
+  ```
+* **How it works**: If a burst of socket-backlogged packets arrives, the check immediately triggers. Instead of deleting all data, it **advances the read tail pointer by the exact excess byte count**.
+* **The Result**: The oldest, lagging samples are skipped instantly. Once the incoming packet is appended, the circular buffer holds exactly **150ms of the newest, most up-to-date audio samples** (the trimmed `audio_buf_size` plus `len` equals `cap_bytes`).
+* **Acoustics**: Truncating just the oldest samples in this manner is perceived by the ear as a seamless micro-skip, maintaining crystal-clear playout fidelity, while guaranteeing that buffered audio latency never exceeds the 150ms ceiling, which keeps audio closely aligned with the video stream.
+
+---
+
+## 🛠️ Diagnostics & Tweaking
+
+Developers can tweak the following properties in `main.c` depending on hardware limits:
+
+1. **`spec.samples` (Hardware Buffer Size)**:
+   - Configured at `1024` samples. On modern Apple Silicon, this can be safely reduced to `512` (approx. 10.7ms) or `256` (approx. 5.3ms) for even lower latency.
+   - For older Intel Macs or systems with high CPU scheduling jitter, keep this at `1024` to prevent buffer underflows (which cause crackling/static).
+2. **`cap_bytes` (Latency Threshold)**:
+   - Configured at `28800` bytes (150ms) to cushion against ScreenCaptureKit variable delivery chunks and socket congestion.
+   - If H.264 video decoding takes longer on a specific system, this can be adjusted to match video latency.