slick-queue-py/slick_queue_py.py at main · SlickQuant/slick-queue-py · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
"""
Python implementation of SlickQueue-compatible shared memory queue.

This implements the same memory layout as the C++ `slick::SlickQueue<T>`
header (64 bytes), an array of `slot` structures starting at offset 64, and
the data array immediately after the slot array.

Multi-Producer Multi-Consumer Support:
- This implementation now uses atomic operations via the atomic_ops module
- On platforms with hardware 128-bit CAS support (x86-64 with CMPXCHG16B),
  provides true lock-free multi-producer and multi-consumer semantics
- On other platforms, falls back to lock-based synchronization

C++/Python Interoperability:
- Python processes can produce/consume to queues created by C++
- C++ processes can produce/consume to queues created by Python
- Memory layout and atomic operations match exactly

Supported on Python 3.8+ (uses multiprocessing.shared_memory).
"""
from __future__ import annotations

__version__ = '1.1.0'

import struct
import sys
from typing import Optional, Tuple, Union
from atomic_ops import AtomicReservedInfo, AtomicUInt64, AtomicCursor, check_platform_support, make_reserved_info, get_index, get_size

# Use Python's built-in shared memory (available in Python 3.8+)
from multiprocessing.shared_memory import SharedMemory

# Layout constants
# Shared memory header layout (64 bytes total):
# Offset 0-7:   std::atomic<reserved_info> (8 bytes)
# Offset 8-11:  size_ (uint32_t)
# Offset 12-15: element_size (uint32_t)
# Offset 16-23: std::atomic<uint64_t> last_published_ (8 bytes)
# Offset 24-27: header_magic (uint32_t) - value 0x534C5131 ('SLQ1')
# Offset 28-47: PADDING (20 bytes)
# Offset 48-51: init_state (atomic uint32_t)
# Offset 52-63: PADDING (12 bytes)
HEADER_SIZE = 64
RESERVED_INFO_SIZE = struct.calcsize(AtomicReservedInfo.RESERVED_INFO_FMT)  # 8 bytes
SIZE_OFFSET = 8
ELEMENT_SIZE_OFFSET = 12
LAST_PUBLISHED_OFFSET = 16
HEADER_MAGIC_OFFSET = 24
HEADER_MAGIC = 0x534C5131  # 'SLQ1' in little-endian
INIT_STATE_OFFSET = 48

# Init state constants (matches C++ queue.h)
INIT_STATE_UNINITIALIZED = 0
INIT_STATE_LEGACY = 1
INIT_STATE_INITIALIZING = 2
INIT_STATE_READY = 3

# Invalid index constant
K_INVALID_INDEX = 2**64 - 1

# slot: atomic_uint64 data_index; uint32 size; 4 bytes padding => 16 bytes
SLOT_FMT = "<Q I 4x"
SLOT_SIZE = struct.calcsize(SLOT_FMT)


class SlickQueue:
    """A fixed-size ring queue compatible with C++ SlickQueue.

    Supports two modes:
    - **Shared memory mode** (when name is provided): Uses shared memory for inter-process communication
    - **Local memory mode** (when name is None): Uses local memory (single process)

    Elements are fixed-length byte blobs of `element_size`.

    Args:
        name: Shared memory segment name. If None, uses local memory mode.
        size: Queue capacity (must be power of 2). Required when creating or using local mode.
        element_size: Size of each element in bytes. Required.
        create: If True, create new shared memory segment (only for shared memory mode).
    """

    def __init__(self, *, name: Optional[str] = None, size: Optional[int] = None, element_size: Optional[int] = None):
        # Store the original user-provided name (without / prefix)
        # Python's SharedMemory will add the / prefix on POSIX systems automatically.
        # We strip any leading / to avoid double-prefixing (//name) on POSIX systems.
        self.name = name
        if self.name is not None and self.name.startswith('/'):
            # Strip leading / if user provided it - Python's SharedMemory will add it back on POSIX
            self.name = self.name[1:]

        # macOS has a 31-character limit for POSIX shared memory names (including leading /)
        # Check the length that will be used (with / prefix on POSIX systems)
        if self.name is not None and sys.platform == 'darwin':
            # On macOS, Python's SharedMemory will prepend /, so check total length
            final_name = '/' + self.name
            if len(final_name) > 31:
                raise ValueError(f"Shared memory name '{final_name}' is {len(final_name)} characters, "
                               f"but macOS has a 31-character limit. Please use a shorter name.")

        self.use_shm = name is not None
        self._shm: Optional[SharedMemory] = None
        self._local_buf: Optional[bytearray] = None
        self.size = None
        self._own = False
        self._last_published_valid = False
        self._atomic_last_published = None

        # Validate parameters
        if size is not None:
            self.size = int(size)
            if self.size & (self.size - 1):
                raise ValueError("size must be a power of two")
            self.mask = self.size - 1

        if element_size is not None:
            self.element_size = int(element_size)

        if self.use_shm:
            # Shared memory mode (C++ with shm_name != nullptr)
            if self.size:
                # create shared memory
                if element_size is None:
                    raise ValueError("size and element_size required when creating")
                total = HEADER_SIZE + SLOT_SIZE * self.size + self.element_size * self.size
                try:
                    self._shm = SharedMemory(name=self.name, create=True, size=total)
                    # print(f"**** create new shm {self.name}")
                except FileExistsError:
                    # print(f"**** shm already exists, opening {self.name}")
                    self._shm = SharedMemory(name=self.name, create=False)

                # Use CAS on init_state to determine ownership (matches C++ queue.h:618-648)
                buf = self._shm.buf
                init_state_atomic = AtomicUInt64(buf, INIT_STATE_OFFSET)

                # Try to atomically claim ownership by CAS from UNINITIALIZED to INITIALIZING
                success, actual_state = init_state_atomic.compare_exchange_weak(
                    INIT_STATE_UNINITIALIZED, INIT_STATE_INITIALIZING
                )

                if success:
                    # We are the creator - initialize the queue (matches C++ queue.h:622-647)
                    self._own = True

                    # Write header_magic at offset 24
                    struct.pack_into("<I", buf, HEADER_MAGIC_OFFSET, HEADER_MAGIC)

                    # Initialize reserved_info atomic at offset 0
                    atomic_reserved = AtomicReservedInfo(buf, 0)
                    # This stores packed (index=0, size=0)
                    struct.pack_into("<Q", buf, 0, 0)

                    # Initialize last_published at offset 16 with kInvalidIndex
                    struct.pack_into("<Q", buf, LAST_PUBLISHED_OFFSET, K_INVALID_INDEX)
                    self._last_published_valid = True

                    # Write size and element_size at offsets 8 and 12
                    struct.pack_into("<I I", buf, SIZE_OFFSET, self.size, element_size)

                    # Initialize slots data_index to max (uint64 max)
                    for i in range(self.size):
                        off = HEADER_SIZE + i * SLOT_SIZE
                        struct.pack_into(SLOT_FMT, buf, off, K_INVALID_INDEX, 1)

                    # Mark initialization complete
                    init_state_atomic.store_release(INIT_STATE_READY)

                else:
                    # Opened existing - wait for initialization and validate (matches C++ queue.h:649-684)
                    self._own = False

                    # Wait for initialization to complete
                    if not self._wait_for_shared_memory_ready(buf):
                        self._shm.close()
                        raise RuntimeError("Timed out waiting for shared memory initialization")

                    # Detect format version
                    self._last_published_valid = self._detect_format_version(buf)

                    # Read and validate metadata
                    ss = struct.unpack_from("<I I", buf, SIZE_OFFSET)
                    if ss[0] != self.size:
                        self._shm.close()
                        raise ValueError(f"size mismatch. Expected {self.size} but got {ss[0]}")
                    if ss[1] != element_size:
                        self._shm.close()
                        raise ValueError(f"element size mismatch. Expected {element_size} but got {ss[1]}")
            else:
                # print(f"**** open existing shm {self.name}")
                # open existing and read size from header
                if element_size is None:
                    raise ValueError("element_size must be provided when opening existing shared memory")

                # Open existing shared memory (size parameter not needed/ignored)
                self._shm = SharedMemory(name=self.name, create=False)
                buf = self._shm.buf

                # Wait for initialization to complete (matches C++ queue.h:558-562)
                if not self._wait_for_shared_memory_ready(buf):
                    self._shm.close()
                    raise RuntimeError("Timed out waiting for shared memory initialization")

                # Detect format version (matches C++ queue.h:564-570)
                self._last_published_valid = self._detect_format_version(buf)

                # Read actual queue size from header
                ss = struct.unpack_from("<I I", buf, SIZE_OFFSET)
                self.size = ss[0]
                elem_sz = ss[1]

                if element_size != elem_sz:
                    self._shm.close()
                    raise ValueError(f"SharedMemory element_size mismatch. Expecting {element_size} but got {elem_sz}")

                self.mask = self.size - 1
                self.element_size = int(element_size)

            self._buf = self._shm.buf
            self._control_offset = HEADER_SIZE
            self._data_offset = HEADER_SIZE + SLOT_SIZE * self.size

            # Initialize atomic wrappers for lock-free operations
            self._atomic_reserved = AtomicReservedInfo(self._buf, 0)
            self._atomic_slots = []
            for i in range(self.size):
                slot_offset = HEADER_SIZE + i * SLOT_SIZE
                self._atomic_slots.append(AtomicUInt64(self._buf, slot_offset))

            # Initialize last_published atomic if modern format
            if self._last_published_valid:
                self._atomic_last_published = AtomicUInt64(self._buf, LAST_PUBLISHED_OFFSET)
        else:
            # Local memory mode (C++ with shm_name == nullptr)
            if size is None or element_size is None:
                raise ValueError("size and element_size required for local memory mode")

            # Create local buffers (equivalent to C++ new T[size_] and new slot[size_])
            # We use a bytearray to simulate the memory layout
            total = HEADER_SIZE + SLOT_SIZE * self.size + self.element_size * self.size
            self._local_buf = bytearray(total)

            # Initialize header with modern format (local mode always uses modern format)
            self._local_buf[:HEADER_SIZE] = bytes(HEADER_SIZE)
            # Write size at offset 8
            struct.pack_into("<I I", self._local_buf, SIZE_OFFSET, self.size, element_size)
            # Initialize last_published at offset 16 with kInvalidIndex
            struct.pack_into("<Q", self._local_buf, LAST_PUBLISHED_OFFSET, K_INVALID_INDEX)
            # Write header_magic at offset 24
            struct.pack_into("<I", self._local_buf, HEADER_MAGIC_OFFSET, HEADER_MAGIC)
            # Write init_state = READY at offset 48
            struct.pack_into("<I", self._local_buf, INIT_STATE_OFFSET, INIT_STATE_READY)
            self._last_published_valid = True

            # Initialize slots data_index to max
            for i in range(self.size):
                off = HEADER_SIZE + i * SLOT_SIZE
                struct.pack_into(SLOT_FMT, self._local_buf, off, K_INVALID_INDEX, 1)

            # Create a memoryview for consistency with shared memory path
            self._buf = memoryview(self._local_buf)
            self._control_offset = HEADER_SIZE
            self._data_offset = HEADER_SIZE + SLOT_SIZE * self.size

            # Initialize atomic wrappers (these work on local memory too)
            self._atomic_reserved = AtomicReservedInfo(self._buf, 0)
            self._atomic_slots = []
            for i in range(self.size):
                slot_offset = HEADER_SIZE + i * SLOT_SIZE
                self._atomic_slots.append(AtomicUInt64(self._buf, slot_offset))

            # Initialize last_published atomic (local mode always uses modern format)
            self._atomic_last_published = AtomicUInt64(self._buf, LAST_PUBLISHED_OFFSET)

    @staticmethod
    def _wait_for_shared_memory_ready(buf: memoryview) -> bool:
        """
        Wait for shared memory initialization to complete.
        Matches C++ queue.h:510-534.

        Args:
            buf: Memory buffer to check

        Returns:
            True if initialization completed successfully, False if timed out
        """
        import time
        init_state_atomic = AtomicUInt64(buf, INIT_STATE_OFFSET)
        max_wait_ms = 2000
        legacy_grace_ms = 5

        for i in range(max_wait_ms):
            state = init_state_atomic.load_acquire()
            if state == INIT_STATE_READY:
                return True

            if state == INIT_STATE_LEGACY and i >= legacy_grace_ms:
                # Legacy format: check if size and element_size are non-zero
                ss = struct.unpack_from("<I I", buf, SIZE_OFFSET)
                if ss[0] != 0 and ss[1] != 0:
                    return True

            time.sleep(0.001)

        return False

    @staticmethod
    def _detect_format_version(buf: memoryview) -> bool:
        """
        Detect if the queue uses modern format with last_published.
        Matches C++ queue.h:564-570.

        Args:
            buf: Memory buffer to check

        Returns:
            True if modern format (last_published_valid), False for legacy
        """
        init_state_atomic = AtomicUInt64(buf, INIT_STATE_OFFSET)
        state = init_state_atomic.load_acquire()
        if state == INIT_STATE_READY:
            magic = struct.unpack_from("<I", buf, HEADER_MAGIC_OFFSET)[0]
            return magic == HEADER_MAGIC
        return False

    # low-level helpers
    def _read_reserved(self) -> Tuple[int, int]:
        buf = self._buf
        packed = struct.unpack_from(AtomicReservedInfo.RESERVED_INFO_FMT, buf, 0)[0]
        return get_index(packed), get_size(packed)

    def _write_reserved(self, index: int, sz: int) -> None:
        packed = make_reserved_info(int(index), int(sz))
        struct.pack_into(AtomicReservedInfo.RESERVED_INFO_FMT, self._buf, 0, packed)

    def _read_slot(self, idx: int) -> Tuple[int, int]:
        off = self._control_offset + idx * SLOT_SIZE
        data_index, size = struct.unpack_from(SLOT_FMT, self._buf, off)
        return int(data_index), int(size)

    def _write_slot(self, idx: int, data_index: int, size: int) -> None:
        off = self._control_offset + idx * SLOT_SIZE
        struct.pack_into(SLOT_FMT, self._buf, off, int(data_index), int(size))

    def get_shm_name(self) -> Optional[str]:
        """
        Get the actual shared memory name for C++ interop.

        Returns the name with POSIX / prefix (required by C++ shm_open).
        On POSIX systems (Linux/macOS), this returns the name with the / prefix.
        On Windows, it returns the name without modification.

        Returns:
            The shared memory name that C++ code should use to open the queue.
            On POSIX systems, this will have the / prefix that shm_open() requires.
        """
        if self._shm is not None:
            # Use the actual name from SharedMemory (which has / prefix on POSIX)
            return self._shm._name
        elif self.name is not None:
            # If SharedMemory not created yet, construct the expected name
            # On POSIX, need to add / prefix; on Windows, use as-is
            if sys.platform != 'win32':
                return '/' + self.name
            else:
                return self.name
        return None

    # Public API mirroring C++ methods
    def reserve(self, n: int = 1) -> int:
        """
        Reserve space in the queue for writing (multi-producer safe).

        Uses atomic CAS to safely reserve slots from multiple producers.
        Matches C++ queue.h:181-213.

        Args:
            n: Number of slots to reserve (default 1)

        Returns:
            Starting index of reserved space

        Raises:
            RuntimeError: If n > queue size
        """
        if n > self.size:
            raise RuntimeError(f"required size {n} > queue size {self.size}")

        # CAS loop for multi-producer safety (matching C++ line 189-205)
        while True:
            # Load current reserved_info with memory_order_relaxed (C++ line 185)
            reserved_index, reserved_size = self._atomic_reserved.load()

            index = reserved_index
            idx = index & self.mask
            buffer_wrapped = False

            # Check if we need to wrap (C++ lines 194-204)
            if (idx + n) > self.size:
                # Wrap to beginning
                index += self.size - idx
                next_index = index + n
                next_size = n
                buffer_wrapped = True
            else:
                # Normal increment
                next_index = reserved_index + n
                next_size = n

            # Atomic CAS with memory_order_release on success (C++ line 205)
            success, actual = self._atomic_reserved.compare_exchange_weak(
                expected=(reserved_index, reserved_size),
                desired=(next_index, next_size)
            )

            if success:
                # CAS succeeded, we own this reservation
                if buffer_wrapped:
                    # Publish wrap marker (C++ lines 206-211)
                    slot_idx = reserved_index & self.mask
                    self._write_slot(slot_idx, index, n)
                return index

            # CAS failed, retry with updated value

    def publish(self, index: int, n: int = 1) -> None:
        """
        Publish data written to reserved space (atomic with release semantics).

        Makes the data visible to consumers. Matches C++ queue.h:325-338.

        Args:
            index: Index returned by reserve()
            n: Number of slots to publish (default 1)
        """
        slot_idx = index & self.mask

        # Write slot size (non-atomic part)
        size_offset = self._control_offset + slot_idx * SLOT_SIZE + 8
        struct.pack_into("<I 4x", self._buf, size_offset, n)

        # Atomic store of data_index with memory_order_release (C++ line 329)
        # This ensures all data writes are visible before the index is published
        self._atomic_slots[slot_idx].store_release(index)

        # Update last_published if modern format (C++ lines 331-337)
        if self._last_published_valid:
            while True:
                current = self._atomic_last_published.load_acquire()
                # Only update if current is invalid or less than our index
                if current != K_INVALID_INDEX and current >= index:
                    break
                success, _ = self._atomic_last_published.compare_exchange_weak(
                    current, index
                )
                if success:
                    break

    def __getitem__(self, index: int) -> memoryview:
        off = self._data_offset + (index & self.mask) * self.element_size
        return self._buf[off: off + self.element_size]

    def read(self, read_index: Union[int, AtomicCursor]) -> Union[Tuple[Optional[bytes], int, int], Tuple[Optional[bytes], int, int]]:
        """
        Read data from the queue.

        This method has two modes:
        1. Single-consumer mode: read(int) -> (data, size, new_index)
        2. Multi-consumer mode: read(AtomicCursor) -> (data, size)

        Single-consumer mode (matches C++ queue.h:246-273):
            Uses a plain int cursor for single-consumer scenarios.
            Returns the new read_index.

        Multi-consumer mode (matches C++ queue.h:283-314):
            Uses an AtomicCursor for work-stealing/load-balancing across multiple consumers.
            Each consumer atomically claims items, ensuring each item is consumed exactly once.

        Note: Unlike C++, the single-consumer version returns the new read_index rather
        than updating by reference, as Python doesn't have true pass-by-reference.

        Args:
            read_index: Either an int (single-consumer) or AtomicCursor (multi-consumer)

        Returns:
            Single-consumer: Tuple of (data_bytes or None, item_size, new_read_index)
            Multi-consumer: Tuple of (data_bytes or None, item_size)
            If no data available returns (None, 0) or (None, 0, read_index)

        Examples:
            # Single consumer
            read_index = 0
            data, size, read_index = q.read(read_index)

            # Multi-consumer work-stealing
            cursor = AtomicCursor(cursor_shm.buf, 0)
            data, size, index = q.read(cursor)  # Atomically claim next item
        """
        if isinstance(read_index, AtomicCursor):
            return self._read_atomic_cursor(read_index)
        else:
            return self._read_single_consumer(read_index)

    def _read_single_consumer(self, read_index: int) -> Tuple[Optional[bytes], int, int]:
        """
        Single-consumer read with atomic acquire semantics.

        Matches C++ queue.h:246-273. For single-consumer use only.

        Args:
            read_index: Current read position

        Returns:
            Tuple of (data_bytes or None, item_size, new_read_index).
            If no data available returns (None, 0, read_index).
        """
        while True:
            idx = read_index & self.mask

            # Atomic load with memory_order_acquire (C++ line 252)
            data_index = self._atomic_slots[idx].load_acquire()

            # Read slot size (non-atomic part)
            size_offset = self._control_offset + idx * SLOT_SIZE + 8
            slot_size = struct.unpack_from("<I", self._buf, size_offset)[0]

            # Check for queue reset (C++ lines 253-256)
            reserved_index, _ = self._atomic_reserved.load()
            if data_index != (2**64 - 1) and reserved_index < data_index:
                read_index = 0
                continue

            # Check if data is ready (C++ lines 258-261)
            if data_index == (2**64 - 1) or data_index < read_index:
                return None, 0, read_index

            # Check for wrap (C++ lines 262-266)
            if data_index > read_index and ((data_index & self.mask) != idx):
                read_index = data_index
                continue

            # Read data (C++ lines 270-272)
            data_off = self._data_offset + (read_index & self.mask) * self.element_size
            data = bytes(self._buf[data_off: data_off + slot_size * self.element_size])
            new_read_index = data_index + slot_size
            return data, slot_size, new_read_index

    def _read_atomic_cursor(self, read_index: AtomicCursor) -> Tuple[Optional[bytes], int, int]:
        """
        Multi-consumer read using a shared atomic cursor (work-stealing pattern).

        Matches C++ queue.h:283-314. Multiple consumers share a single atomic cursor,
        atomically claiming items to process. Each item is consumed by exactly one consumer.

        Args:
            read_index: Shared AtomicCursor for coordinating multiple consumers

        Returns:
            Tuple of (data_bytes or None, item_size, data_index).
            If no data available returns (None, 0, -1).
        """
        if self._buf is None:
            raise RuntimeError("Queue buffer is not initialized")

        while True:
            # Load current cursor position (C++ line 285)
            current_index = read_index.load()
            idx = current_index & self.mask

            # Load slot data_index (C++ line 288)
            data_index = self._atomic_slots[idx].load_acquire()

            # Read slot size (non-atomic part)
            size_offset = self._control_offset + idx * SLOT_SIZE + 8
            slot_size = struct.unpack_from("<I", self._buf, size_offset)[0]

            # Check for queue reset (C++ lines 290-294)
            reserved_index, _ = self._atomic_reserved.load()
            if data_index != (2**64 - 1) and reserved_index < data_index:
                read_index.store(0)
                continue

            # Check if data is ready (C++ lines 296-299)
            if data_index == (2**64 - 1) or data_index < current_index:
                return None, 0, -1

            # Check for wrap (C++ lines 300-304)
            if data_index > current_index and ((data_index & self.mask) != idx):
                # Try to atomically update cursor to skip wrapped slots
                read_index.compare_exchange_weak(current_index, data_index)
                continue

            # Try to atomically claim this item (C++ lines 306-313)
            next_index = data_index + slot_size
            success, _ = read_index.compare_exchange_weak(current_index, next_index)

            if success:
                # Successfully claimed the item, read and return it
                data_off = self._data_offset + (current_index & self.mask) * self.element_size
                data = bytes(self._buf[data_off: data_off + slot_size * self.element_size])
                return data, slot_size, current_index

            # CAS failed, another consumer claimed it, retry

    def read_last(self) -> Tuple[Optional[bytes], int]:
        """
        Read the last published data in the queue.

        Matches C++ queue.h:439-458.

        Returns:
            Tuple of (data_bytes or None, item_size).
            If no data available returns (None, 0).
        """
        if self._last_published_valid:
            # Modern format: use last_published atomic (C++ lines 440-446)
            last_index = self._atomic_last_published.load_acquire()
            if last_index == K_INVALID_INDEX:
                return None, 0

            # Read slot size from control array
            slot_idx = last_index & self.mask
            size_offset = self._control_offset + slot_idx * SLOT_SIZE + 8
            slot_size = struct.unpack_from("<I", self._buf, size_offset)[0]

            # Read data
            data_off = self._data_offset + slot_idx * self.element_size
            data = bytes(self._buf[data_off: data_off + slot_size * self.element_size])
            return data, slot_size
        else:
            # Legacy format: use reserved_info (C++ lines 449-457)
            reserved_index, reserved_size = self._read_reserved()
            if reserved_index == 0:
                return None, 0
            last_index = reserved_index - reserved_size
            off = self._data_offset + (last_index & self.mask) * self.element_size
            data = bytes(self._buf[off: off + reserved_size * self.element_size])
            return data, reserved_size

    def reset(self) -> None:
        """Reset the queue to its initial state.

        This is a low-level operation that should be used with caution.
        It is typically used in testing or when the queue needs to be reinitialized.
        Matches C++ queue.h:465-477.
        """
        # Reset all slots to their initial state
        for i in range(self.size):
            self._write_slot(i, K_INVALID_INDEX, 1)

        if self.use_shm:
            # Reset reserved_info to initial state
            self._write_reserved(0, 0)

        # Reset last_published if modern format (C++ line 473)
        if self._last_published_valid:
            self._atomic_last_published.store_release(K_INVALID_INDEX)

    def close(self) -> None:
        """Close the queue connection.

        For shared memory mode: releases all references to avoid 'exported pointers exist' errors.
        For local memory mode: releases local buffer.
        """
        try:
            # Release atomic wrapper references to the buffer
            if hasattr(self, '_atomic_reserved') and self._atomic_reserved:
                self._atomic_reserved.release()
            self._atomic_reserved = None

            if hasattr(self, '_atomic_slots') and self._atomic_slots:
                for slot in self._atomic_slots:
                    slot.release()
            self._atomic_slots = None

            # Release last_published atomic if it exists
            if hasattr(self, '_atomic_last_published') and self._atomic_last_published:
                self._atomic_last_published.release()
            self._atomic_last_published = None

            self._buf = None

            # Close shared memory if using it
            if self.use_shm and self._shm:
                try:
                    # prevent Exception ignored in: <function SharedMemory.__del__ at 0x00000176D1BFA8E0>
                    self._shm._mmap = None
                    self._shm.close()
                    self._shm = None
                except Exception:
                    pass

            # Clear local buffer if using it
            if not self.use_shm and self._local_buf:
                self._local_buf = None
        except Exception as e:
            print(e)
            pass

    def unlink(self) -> None:
        """Unlink (delete) the shared memory segment.

        Only applicable for shared memory mode. Does nothing for local memory mode.
        """
        if not self.use_shm:
            return  # Nothing to unlink for local memory

        try:
            if self._shm:
                self._shm.unlink()
        except Exception:
            pass

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):  # noqa: U100
        """Context manager exit - ensures proper cleanup."""
        self.close()
        return False


__all__ = ["SlickQueue", "AtomicCursor", "__version__"]