diff --git a/.github/tests/lm_tests.py b/.github/tests/lm_tests.py
index 0a39b8a9..c6852009 100644
--- a/.github/tests/lm_tests.py
+++ b/.github/tests/lm_tests.py
@@ -8,6 +8,7 @@
 import lotus
 from lotus.models import LM, SentenceTransformersRM
 from lotus.types import CascadeArgs
+from lotus.sem_ops.ensembling import Ensemble, EnsembleConfig, EnsembleStrategy
 from lotus.vector_store import FaissVS
 
 ################################################################################
@@ -499,6 +500,44 @@ def test_filter_cascade(setup_models):
     assert stats["filters_resolved_by_helper_model"] > 0, stats
 
 
+@pytest.mark.skipif(not ENABLE_OPENAI_TESTS, reason="Skipping test because OpenAI tests are not enabled")
+def test_filter_ensembling(setup_models):
+    models = setup_models
+    lotus.settings.configure(lm=models["gpt-4o-mini"])
+
+    data = {
+        "Text": [
+            "I am really excited to go to class today!",
+            "I am very sad",
+        ]
+    }
+    df = pd.DataFrame(data)
+    user_instruction = "{Text} is a positive sentiment"
+
+    # Test with n_sample=2 and majority vote
+    filtered_df = df.sem_filter(
+        user_instruction,
+        n_sample=2,
+        ensemble=Ensemble(EnsembleConfig(strategy=EnsembleStrategy.MAJORITY_VOTE)),
+        return_raw_outputs=True,
+        return_explanations=True,
+        return_all=True
+    )
+
+    # Check for new columns
+    expected_cols = [
+        "raw_output_1", "parsed_output_1", "explanation_1",
+        "raw_output_2", "parsed_output_2", "explanation_2",
+        "filter_label"  # Ensemble result
+    ]
+    for col in expected_cols:
+        assert col in filtered_df.columns, f"Column {col} missing from dataframe"
+    
+    # Check ensemble logic (both samples should be True for first row)
+    assert filtered_df.iloc[0]["filter_label"] == True
+    assert filtered_df.iloc[1]["filter_label"] == False
+
+
 @pytest.mark.skipif(not ENABLE_OPENAI_TESTS, reason="Skipping test because OpenAI tests are not enabled")
 def test_join_cascade(setup_models):
     models = setup_models
diff --git a/.github/tests/multimodality_tests.py b/.github/tests/multimodality_tests.py
index db566a38..d2664ae4 100644
--- a/.github/tests/multimodality_tests.py
+++ b/.github/tests/multimodality_tests.py
@@ -4,7 +4,7 @@
 import pytest
 
 import lotus
-from lotus.dtype_extensions import ImageArray
+from lotus.dtype_extensions import ImageArray, AudioArray
 from lotus.models import LM, SentenceTransformersRM
 from lotus.vector_store import FaissVS
 
@@ -20,6 +20,7 @@
 
 MODEL_NAME_TO_ENABLED = {
     "gpt-4o-mini": ENABLE_OPENAI_TESTS,
+    "gpt-4o-audio-preview": ENABLE_OPENAI_TESTS,
     "clip-ViT-B-32": ENABLE_LOCAL_TESTS,
 }
 ENABLED_MODEL_NAMES = set([model_name for model_name, is_enabled in MODEL_NAME_TO_ENABLED.items() if is_enabled])
@@ -27,6 +28,7 @@
 MODEL_NAME_TO_CLS = {
     "clip-ViT-B-32": SentenceTransformersRM,
     "gpt-4o-mini": LM,
+    "gpt-4o-audio-preview": LM,
 }
 
 
@@ -228,3 +230,25 @@ def test_sim_join_operation_text_index(setup_models, model):
         ("https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg", "bird"),
     ]
     assert expected_result == list(zip(joined_df["image"], joined_df["element"]))
+
+
+@pytest.mark.parametrize("model", get_enabled("gpt-4o-audio-preview"))
+def test_filter_operation_audio(setup_models, model):
+    lm = setup_models[model]
+    lotus.settings.configure(lm=lm)
+
+    # Use a real wav file content to ensure valid format
+    import base64
+    with open("test_audio.wav", "rb") as f:
+        wav_bytes = f.read()
+    wav_b64 = "data:audio/wav;base64," + base64.b64encode(wav_bytes).decode("utf-8")
+    
+    audio_data = [wav_b64, wav_b64]
+    df = pd.DataFrame({"audio": AudioArray(audio_data)})
+    user_instruction = "{audio} contains audio"
+    
+    # Just verify it runs without error and returns a dataframe
+    filtered_df = df.sem_filter(user_instruction)
+    assert isinstance(filtered_df, pd.DataFrame)
+
+
diff --git a/examples/ensembling_example.py b/examples/ensembling_example.py
new file mode 100644
index 00000000..c0af65b5
--- /dev/null
+++ b/examples/ensembling_example.py
@@ -0,0 +1,98 @@
+"""
+Example: Using Test-Time Scaling (Ensembling) with sem_filter
+
+This example demonstrates how to use the new ensembling feature in sem_filter
+to improve prediction accuracy by aggregating multiple LLM samples.
+"""
+
+import pandas as pd
+
+import lotus
+from lotus.models import LM
+from lotus.sem_ops.ensembling import Ensemble, EnsembleConfig, EnsembleStrategy
+
+# Configure the language model
+lm = LM(model="gpt-4o-mini")
+lotus.settings.configure(lm=lm)
+
+# Create a sample DataFrame with movie reviews
+df = pd.DataFrame({
+    "review": [
+        "This movie was absolutely fantastic! Best film I've seen all year.",
+        "Terrible waste of time. The plot made no sense whatsoever.",
+        "It was okay, had some good moments but also some boring parts.",
+        "A masterpiece of modern cinema. Highly recommend!",
+        "I fell asleep halfway through. Very disappointing.",
+    ]
+})
+
+# Example 1: Basic ensembling with default MAJORITY_VOTE strategy
+print("Example 1: Basic Ensembling (Majority Vote)")
+print("-" * 50)
+
+result = df.sem_filter(
+    "The {review} expresses a positive sentiment",
+    n_sample=3,  # Run 3 samples and aggregate
+)
+
+print(f"Filtered to {len(result)} positive reviews")
+print(result)
+
+# Example 2: Using a custom ensemble configuration
+print("\nExample 2: Custom Ensemble Configuration (Weighted Average)")
+print("-" * 50)
+
+# Create a custom ensemble with weighted average strategy
+config = EnsembleConfig(
+    strategy=EnsembleStrategy.WEIGHTED_AVERAGE,
+    weights=[0.5, 0.3, 0.2],  # Weight earlier samples more heavily
+)
+ensemble = Ensemble(config)
+
+result = df.sem_filter(
+    "The {review} mentions specific plot details",
+    n_sample=3,
+    ensemble=ensemble,
+)
+
+print(f"Filtered to {len(result)} reviews with plot details")
+print(result)
+
+# Example 3: Accessing per-run data
+print("\nExample 3: Accessing Per-Run Data")
+print("-" * 50)
+
+# Use return_all=True to get full output object with per-run details
+result_with_details, stats = df.sem_filter(
+    "The {review} is written in a sarcastic tone",
+    n_sample=5,
+    return_stats=True,
+    return_all=True,  # Return all rows, not just filtered ones
+)
+
+# The output contains predictions from all runs
+# Access via the _raw_outputs attribute
+print("Total samples run: 5")
+print(f"Stats: {stats}")
+print(result_with_details)
+
+# Example 4: Consensus strategy (only returns True if all samples agree)
+print("\nExample 4: Consensus Strategy")
+print("-" * 50)
+
+config = EnsembleConfig(
+    strategy=EnsembleStrategy.CONSENSUS,
+    default=False,  # Default to False if no consensus
+)
+ensemble = Ensemble(config)
+
+result = df.sem_filter(
+    "The {review} contains extremely strong language",
+    n_sample=3,
+    ensemble=ensemble,
+)
+
+print(f"Filtered to {len(result)} reviews (required unanimous agreement)")
+print(result)
+
+print("\nDone!")
diff --git a/lotus/dtype_extensions/__init__.py b/lotus/dtype_extensions/__init__.py
index b836bdb5..5103bcc5 100644
--- a/lotus/dtype_extensions/__init__.py
+++ b/lotus/dtype_extensions/__init__.py
@@ -1,7 +1,9 @@
 from lotus.dtype_extensions.image import ImageDtype, ImageArray
+from lotus.dtype_extensions.audio import AudioDtype, AudioArray
 import pandas as pd
 
 pd.api.extensions.register_extension_dtype(ImageDtype)
+pd.api.extensions.register_extension_dtype(AudioDtype)
 
 
 def convert_to_base_data(data: pd.Series | list) -> list:
@@ -9,13 +11,17 @@ def convert_to_base_data(data: pd.Series | list) -> list:
     Converts data to proper base data type.
     - For original pandas data types, this is returns tolist().
     - For ImageDtype, this returns list of PIL.Image.Image.
+    - For AudioDtype, this returns list of audio data.
     """
     if isinstance(data, pd.Series):
         if isinstance(data.dtype, ImageDtype):
             return [data.array.get_image(i) for i in range(len(data))]
+        if isinstance(data.dtype, AudioDtype):
+            return [data.array.get_audio(i) for i in range(len(data))]
         return data.tolist()
 
     return data
 
 
-__all__ = ["ImageDtype", "ImageArray", "convert_to_base_data"]
+__all__ = ["ImageDtype", "ImageArray", "AudioDtype", "AudioArray", "convert_to_base_data"]
+
diff --git a/lotus/dtype_extensions/audio.py b/lotus/dtype_extensions/audio.py
new file mode 100644
index 00000000..b2981f28
--- /dev/null
+++ b/lotus/dtype_extensions/audio.py
@@ -0,0 +1,533 @@
+"""
+Audio data type extension for LOTUS semantic operators.
+
+This module provides a custom pandas ExtensionDtype and ExtensionArray for
+storing and manipulating audio data within DataFrames. It enables semantic
+operators to process audio files (.wav, .mp3, .mp4, .flac, .ogg) alongside
+other data types.
+
+The implementation mirrors the ImageArray pattern but is specialized for
+audio content, supporting both file paths and base64-encoded audio data.
+
+Example:
+    >>> import pandas as pd
+    >>> from lotus.dtype_extensions import AudioArray
+    >>> 
+    >>> audio_files = ['speech1.wav', 'speech2.mp3', 'music.flac']
+    >>> df = pd.DataFrame({'audio': AudioArray(audio_files)})
+    >>> df.sem_filter("the {audio} contains speech")
+"""
+
+import base64
+import sys
+from pathlib import Path
+from typing import Any, Sequence, Union
+
+import numpy as np
+import pandas as pd
+from pandas.api.extensions import ExtensionArray, ExtensionDtype
+
+# Supported audio formats and their MIME types
+SUPPORTED_AUDIO_FORMATS = {
+    '.wav': 'audio/wav',
+    '.mp3': 'audio/mpeg',
+    '.mp4': 'audio/mp4',
+    '.m4a': 'audio/mp4',
+    '.flac': 'audio/flac',
+    '.ogg': 'audio/ogg',
+    '.webm': 'audio/webm',
+}
+
+
+class AudioDtype(ExtensionDtype):
+    """
+    A custom pandas ExtensionDtype for representing audio data.
+    
+    This dtype allows audio files or audio data to be stored in pandas
+    DataFrames and processed by LOTUS semantic operators.
+    
+    Attributes:
+        name: The string identifier for this dtype ("audio").
+        type: The scalar type for this dtype (bytes for raw audio data).
+        na_value: The missing value representation (None).
+    """
+    
+    name = "audio"
+    type = bytes
+    na_value = None
+    
+    @classmethod
+    def construct_array_type(cls):
+        """
+        Return the array type associated with this dtype.
+        
+        Returns:
+            type: The AudioArray class.
+        """
+        return AudioArray
+
+
+class AudioArray(ExtensionArray):
+    """
+    A pandas ExtensionArray for storing and manipulating audio data.
+    
+    This class allows audio files or audio data references to be stored
+    in a pandas Series or DataFrame column. It supports efficient access,
+    caching, and conversion to various formats for LLM processing.
+    
+    Attributes:
+        _data: The underlying numpy array storing audio file paths or data.
+        _dtype: The AudioDtype instance for this array.
+        _cached_audio: Cache for loaded audio data, keyed by (index, format).
+        
+    Example:
+        >>> audio_arr = AudioArray(['file1.wav', 'file2.mp3'])
+        >>> len(audio_arr)
+        2
+        >>> audio_arr.get_audio(0, audio_format='base64')
+        'data:audio/wav;base64,UklGRi...'
+    """
+    
+    def __init__(self, values):
+        """
+        Initialize the AudioArray.
+        
+        Args:
+            values: The initial values for the array. Can be file paths,
+                    URLs, or base64-encoded audio strings.
+        """
+        self._data = np.asarray(values, dtype=object)
+        self._dtype = AudioDtype()
+        self._cached_audio: dict[tuple[int, str], Union[bytes, str, None]] = {}
+    
+    def __getitem__(self, item: Union[int, slice, Sequence[int]]) -> Any:
+        """
+        Retrieve one or more items from the array.
+        
+        Args:
+            item: The index, slice, or sequence of indices to retrieve.
+            
+        Returns:
+            The audio reference at the given index, or a new AudioArray
+            for slices and sequences.
+        """
+        result = self._data[item]
+        
+        if isinstance(item, (int, np.integer)):
+            return result
+        
+        return AudioArray(result)
+    
+    def __setitem__(
+        self, 
+        key: Union[int, slice, Sequence[int], np.ndarray], 
+        value: Any
+    ) -> None:
+        """
+        Set one or more values in the array, invalidating cache entries.
+        
+        Args:
+            key: The index, slice, sequence, or boolean mask to set.
+            value: The value or values to assign.
+        """
+        # Normalize key to a list of indices
+        if isinstance(key, np.ndarray):
+            if key.dtype == bool:
+                key = np.where(key)[0]
+            key = key.tolist()
+        if isinstance(key, (int, np.integer)):
+            key = [key]
+        if isinstance(key, slice):
+            key = range(*key.indices(len(self)))
+        
+        # Handle iterable values
+        if hasattr(value, "__iter__") and not isinstance(value, (str, bytes)):
+            for idx, val in zip(key, value):
+                self._data[idx] = val
+                self._invalidate_cache(idx)
+        else:
+            for idx in key:
+                self._data[idx] = value
+                self._invalidate_cache(idx)
+    
+    def _invalidate_cache(self, idx: int) -> None:
+        """
+        Remove cached audio data for the specified index.
+        
+        Args:
+            idx: The index to invalidate in the cache.
+        """
+        keys_to_remove = [k for k in self._cached_audio if k[0] == idx]
+        for key in keys_to_remove:
+            del self._cached_audio[key]
+    
+    def get_audio(
+        self, 
+        idx: int, 
+        audio_format: str = "base64"
+    ) -> Union[bytes, str, None]:
+        """
+        Fetch and return audio data for the given index.
+        
+        Supports caching to avoid repeated file reads or conversions.
+        
+        Args:
+            idx: The index of the audio to fetch.
+            audio_format: The format to return ("base64" or "bytes").
+            
+        Returns:
+            The audio data in the requested format, or None if unavailable.
+            
+        Raises:
+            ValueError: If the audio format is not supported.
+        """
+        cache_key = (idx, audio_format)
+        
+        if cache_key in self._cached_audio:
+            return self._cached_audio[cache_key]
+        
+        raw_value = self._data[idx]
+        if raw_value is None:
+            return None
+        
+        # Load and convert the audio
+        audio_data = self._load_audio(raw_value, audio_format)
+        self._cached_audio[cache_key] = audio_data
+        
+        return audio_data
+    
+    def _load_audio(
+        self, 
+        value: Any, 
+        audio_format: str
+    ) -> Union[bytes, str, None]:
+        """
+        Load audio data from various source types.
+        
+        Args:
+            value: The audio source (file path, URL, or base64 string).
+            audio_format: The desired output format.
+            
+        Returns:
+            The audio data in the requested format.
+        """
+        if value is None:
+            return None
+        
+        # Handle file paths
+        if isinstance(value, (str, Path)):
+            path = Path(value)
+            if path.exists() and path.is_file():
+                return self._load_from_file(path, audio_format)
+            
+            # Check if it's already a base64 data URI
+            if isinstance(value, str) and value.startswith("data:audio"):
+                if audio_format == "base64":
+                    return value
+                return self._decode_base64_uri(value)
+        
+        # Handle raw bytes
+        if isinstance(value, bytes):
+            if audio_format == "bytes":
+                return value
+            return self._encode_to_base64(value, "audio/octet-stream")
+        
+        return None
+    
+    def _load_from_file(
+        self, 
+        path: Path, 
+        audio_format: str
+    ) -> Union[bytes, str, None]:
+        """
+        Load audio data from a file path.
+        
+        Args:
+            path: The path to the audio file.
+            audio_format: The desired output format.
+            
+        Returns:
+            The audio data in the requested format.
+        """
+        suffix = path.suffix.lower()
+        mime_type = SUPPORTED_AUDIO_FORMATS.get(suffix, "audio/octet-stream")
+        
+        try:
+            with open(path, "rb") as f:
+                audio_bytes = f.read()
+            
+            if audio_format == "bytes":
+                return audio_bytes
+            
+            return self._encode_to_base64(audio_bytes, mime_type)
+            
+        except (IOError, OSError):
+            # Log the error but don't crash - return None for missing files
+            return None
+    
+    def _encode_to_base64(self, audio_bytes: bytes, mime_type: str) -> str:
+        """
+        Encode raw audio bytes to a base64 data URI.
+        
+        Args:
+            audio_bytes: The raw audio data.
+            mime_type: The MIME type of the audio.
+            
+        Returns:
+            A base64-encoded data URI string.
+        """
+        encoded = base64.b64encode(audio_bytes).decode("utf-8")
+        return f"data:{mime_type};base64,{encoded}"
+    
+    def _decode_base64_uri(self, uri: str) -> bytes:
+        """
+        Decode a base64 data URI to raw bytes.
+        
+        Args:
+            uri: The base64 data URI string.
+            
+        Returns:
+            The decoded raw audio bytes.
+        """
+        # Extract the base64 portion after the header
+        if ";base64," in uri:
+            _, encoded = uri.split(";base64,", 1)
+            return base64.b64decode(encoded)
+        return b""
+    
+    def isna(self) -> np.ndarray:
+        """
+        Detect missing values in the array.
+        
+        Returns:
+            Boolean array indicating missing values.
+        """
+        return pd.isna(self._data)
+    
+    def take(
+        self, 
+        indices: Sequence[int], 
+        allow_fill: bool = False, 
+        fill_value=None
+    ) -> "AudioArray":
+        """
+        Take elements from the array by index.
+        
+        Args:
+            indices: Indices of elements to take.
+            allow_fill: If True, -1 indices indicate fill positions.
+            fill_value: Value to use for fill positions.
+            
+        Returns:
+            A new AudioArray with the selected elements.
+        """
+        result = self._data.take(indices, axis=0)
+        if allow_fill and fill_value is not None:
+            result[np.asarray(indices) == -1] = fill_value
+        return AudioArray(result)
+    
+    def copy(self) -> "AudioArray":
+        """
+        Return a copy of the array, including the cache.
+        
+        Returns:
+            A new AudioArray with copied data.
+        """
+        new_array = AudioArray(self._data.copy())
+        new_array._cached_audio = self._cached_audio.copy()
+        return new_array
+    
+    @classmethod
+    def _concat_same_type(cls, to_concat: Sequence["AudioArray"]) -> "AudioArray":
+        """
+        Concatenate multiple AudioArray instances.
+        
+        Args:
+            to_concat: Sequence of AudioArray instances to concatenate.
+            
+        Returns:
+            A new AudioArray containing all elements.
+        """
+        combined_data = np.concatenate([arr._data for arr in to_concat])
+        return cls._from_sequence(combined_data)
+    
+    @classmethod
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        """
+        Construct a new AudioArray from a sequence of scalars.
+        
+        Args:
+            scalars: The input sequence of audio references.
+            dtype: Ignored (for API compatibility).
+            copy: If True, copy the input data.
+            
+        Returns:
+            A new AudioArray instance.
+        """
+        if copy:
+            scalars = np.array(scalars, dtype=object, copy=True)
+        return cls(scalars)
+    
+    def __len__(self) -> int:
+        """Return the number of elements in the array."""
+        return len(self._data)
+    
+    def __eq__(self, other) -> np.ndarray:  # type: ignore
+        """
+        Compare this AudioArray to another object for equality.
+        
+        Args:
+            other: Another AudioArray, sequence, or scalar to compare.
+            
+        Returns:
+            Boolean array indicating element-wise equality.
+        """
+        if isinstance(other, AudioArray):
+            return self._data == other._data
+        
+        if hasattr(other, "__iter__") and not isinstance(other, str):
+            if len(other) != len(self):
+                return np.repeat(False, len(self))
+            return np.array([a == b for a, b in zip(self._data, other)], dtype=bool)
+        
+        return np.array([a == other for a in self._data], dtype=bool)
+    
+    @property
+    def dtype(self) -> AudioDtype:
+        """Return the dtype for this array."""
+        return self._dtype
+    
+    @property
+    def nbytes(self) -> int:
+        """
+        Return the total memory consumption of the array elements.
+        
+        Returns:
+            Total bytes consumed by the stored audio references.
+        """
+        return sum(sys.getsizeof(item) for item in self._data if item)
+    
+    def __repr__(self) -> str:
+        """Return a string representation of the AudioArray."""
+        preview = ", ".join([
+            f"<Audio: {self._get_audio_info(item)}>" if item else "None"
+            for item in self._data[:5]
+        ])
+        suffix = ", ..." if len(self._data) > 5 else ""
+        return f"AudioArray([{preview}{suffix}])"
+    
+    def _get_audio_info(self, item: Any) -> str:
+        """
+        Get a brief description of an audio item.
+        
+        Args:
+            item: The audio reference to describe.
+            
+        Returns:
+            A short string describing the audio item.
+        """
+        if isinstance(item, (str, Path)):
+            path = Path(item)
+            if path.suffix.lower() in SUPPORTED_AUDIO_FORMATS:
+                return path.name
+            if str(item).startswith("data:audio"):
+                return "base64"
+        if isinstance(item, bytes):
+            return f"{len(item)} bytes"
+        return str(type(item).__name__)
+    
+    def _formatter(self, boxed: bool = False):
+        """
+        Return a formatter function for displaying array elements.
+        
+        Args:
+            boxed: Whether to use a boxed formatter (unused).
+            
+        Returns:
+            A function that formats an element for display.
+        """
+        return lambda x: f"<Audio: {self._get_audio_info(x)}>" if x else "None"
+    
+    def to_numpy(self, dtype=None, copy=False, na_value=None) -> np.ndarray:
+        """
+        Convert the AudioArray to a numpy array.
+        
+        Args:
+            dtype: Ignored (for API compatibility).
+            copy: If True, return a copy of the data.
+            na_value: Ignored (for API compatibility).
+            
+        Returns:
+            A numpy array of audio references.
+        """
+        if copy:
+            return self._data.copy()
+        return self._data
+    
+    def __array__(self, dtype=None) -> np.ndarray:
+        """
+        Numpy array interface for AudioArray.
+        
+        Args:
+            dtype: Ignored (for API compatibility).
+            
+        Returns:
+            A numpy array of audio references.
+        """
+        return self.to_numpy(dtype=dtype)
+    
+    def get_duration(self, idx: int) -> float | None:
+        """
+        Get the duration of an audio file in seconds.
+        
+        This method attempts to read the duration without loading
+        the entire audio file into memory when possible.
+        
+        Args:
+            idx: The index of the audio to get duration for.
+            
+        Returns:
+            Duration in seconds, or None if unavailable.
+            
+        Note:
+            This requires the audio file to be accessible on disk.
+            Duration detection may not work for all formats.
+        """
+        raw_value = self._data[idx]
+        if raw_value is None:
+            return None
+        
+        if isinstance(raw_value, (str, Path)):
+            path = Path(raw_value)
+            if path.exists():
+                # Basic duration estimation based on file size
+                # More accurate duration requires format-specific libraries
+                return None  # Placeholder for format-specific implementation
+        
+        return None
+    
+    def get_mime_type(self, idx: int) -> str | None:
+        """
+        Get the MIME type of an audio file.
+        
+        Args:
+            idx: The index of the audio to get MIME type for.
+            
+        Returns:
+            The MIME type string, or None if unavailable.
+        """
+        raw_value = self._data[idx]
+        if raw_value is None:
+            return None
+        
+        if isinstance(raw_value, (str, Path)):
+            path = Path(raw_value)
+            suffix = path.suffix.lower()
+            return SUPPORTED_AUDIO_FORMATS.get(suffix)
+        
+        if isinstance(raw_value, str) and raw_value.startswith("data:"):
+            # Extract MIME type from data URI
+            if ";" in raw_value:
+                return raw_value.split(";")[0].replace("data:", "")
+        
+        return None
diff --git a/lotus/sem_ops/__init__.py b/lotus/sem_ops/__init__.py
index 4857f42f..8d5e6a45 100644
--- a/lotus/sem_ops/__init__.py
+++ b/lotus/sem_ops/__init__.py
@@ -12,4 +12,6 @@
     "sem_cluster_by",
     "sem_partition_by",
     "sem_dedup",
+    "ensembling",
 ]
+
diff --git a/lotus/sem_ops/ensembling.py b/lotus/sem_ops/ensembling.py
new file mode 100644
index 00000000..d828f750
--- /dev/null
+++ b/lotus/sem_ops/ensembling.py
@@ -0,0 +1,265 @@
+"""
+Ensembling strategies for test-time scaling in semantic operations.
+
+This module provides various ensembling strategies that can be used to improve
+the accuracy and robustness of semantic operators by combining multiple samples
+from the language model.
+
+Strategies implemented:
+    - majority_vote: Takes the most common result across samples
+    - weighted_average: Weighs predictions by confidence scores
+    - consensus: Returns result only if all samples agree
+    - confidence_threshold: Uses majority vote with minimum confidence
+
+Example:
+    >>> from lotus.sem_ops.ensembling import Ensemble
+    >>> ensemble = Ensemble(strategy='majority_vote', n_samples=3)
+    >>> results = ensemble.aggregate(sample_outputs)
+"""
+
+from collections import Counter
+from dataclasses import dataclass
+from enum import Enum
+from typing import Any
+
+
+class EnsembleStrategy(Enum):
+    """Available ensembling strategies for test-time scaling."""
+    
+    MAJORITY_VOTE = "majority_vote"
+    WEIGHTED_AVERAGE = "weighted_average"
+    CONSENSUS = "consensus"
+    CONFIDENCE_THRESHOLD = "confidence_threshold"
+
+
+@dataclass
+class EnsembleConfig:
+    """
+    Configuration for ensemble-based test-time scaling.
+    
+    Attributes:
+        strategy: The ensembling strategy to use.
+        weights: Optional weights for weighted averaging strategy.
+        default: Default value for consensus strategy when no agreement.
+        confidence_threshold: Minimum confidence required for confidence-based strategies.
+    """
+    
+    strategy: EnsembleStrategy = EnsembleStrategy.MAJORITY_VOTE
+    weights: list[float] | None = None
+    default: Any = None
+    confidence_threshold: float = 0.6
+
+
+def majority_vote(samples: list[Any]) -> Any:
+    """
+    Return the most common value from a list of samples.
+    
+    Uses Counter to find the mode. In case of ties, returns the first
+    value that reaches the maximum count (deterministic ordering).
+    
+    Args:
+        samples: List of sample predictions to aggregate.
+        
+    Returns:
+        The most frequently occurring value in the samples.
+        
+    Example:
+        >>> majority_vote([True, True, False])
+        True
+        >>> majority_vote(['cat', 'dog', 'cat'])
+        'cat'
+    """
+    if not samples:
+        raise ValueError("Cannot compute majority vote on empty list")
+    
+    counter = Counter(samples)
+    return counter.most_common(1)[0][0]
+
+
+def weighted_average(samples: list[bool], weights: list[float] | None = None) -> bool:
+    """
+    Compute a weighted vote for boolean predictions.
+    
+    For boolean outputs, calculates the weighted sum and returns True
+    if the weighted proportion of True values exceeds 0.5.
+    
+    Args:
+        samples: List of boolean predictions.
+        weights: Optional list of weights for each sample. If None,
+                 uniform weights are used.
+                 
+    Returns:
+        True if weighted average exceeds 0.5, False otherwise.
+        
+    Example:
+        >>> weighted_average([True, True, False], [0.8, 0.6, 0.4])
+        True
+    """
+    if not samples:
+        raise ValueError("Cannot compute weighted average on empty list")
+    
+    if weights is None:
+        weights = [1.0] * len(samples)
+    
+    if len(samples) != len(weights):
+        raise ValueError("Samples and weights must have the same length")
+    
+    total_weight = sum(weights)
+    if total_weight == 0:
+        raise ValueError("Total weight cannot be zero")
+    
+    weighted_sum = sum(w * (1.0 if s else 0.0) for s, w in zip(samples, weights))
+    return weighted_sum / total_weight > 0.5
+
+
+def consensus(samples: list[Any], default: Any = None) -> Any:
+    """
+    Return the result only if all samples agree.
+    
+    Provides high confidence results by requiring unanimous agreement.
+    Returns the default value if samples disagree.
+    
+    Args:
+        samples: List of sample predictions.
+        default: Value to return if no consensus is reached.
+        
+    Returns:
+        The unanimous value if all samples agree, otherwise the default.
+        
+    Example:
+        >>> consensus([True, True, True])
+        True
+        >>> consensus([True, True, False], default=None)
+        None
+    """
+    if not samples:
+        raise ValueError("Cannot compute consensus on empty list")
+    
+    first_value = samples[0]
+    if all(s == first_value for s in samples):
+        return first_value
+    return default
+
+
+def confidence_threshold(
+    samples: list[Any], 
+    threshold: float = 0.6
+) -> tuple[Any, float]:
+    """
+    Use majority vote with confidence tracking.
+    
+    Returns the majority vote result along with the confidence score,
+    which is the proportion of samples that agree with the result.
+    
+    Args:
+        samples: List of sample predictions.
+        threshold: Minimum proportion required for confidence.
+        
+    Returns:
+        Tuple of (result, confidence). If confidence is below threshold,
+        the result may be less reliable.
+        
+    Example:
+        >>> confidence_threshold([True, True, False])
+        (True, 0.667)
+    """
+    if not samples:
+        raise ValueError("Cannot compute confidence on empty list")
+    
+    counter = Counter(samples)
+    most_common_value, count = counter.most_common(1)[0]
+    confidence = count / len(samples)
+    
+    return most_common_value, confidence
+
+
+class Ensemble:
+    """
+    Manages test-time scaling through ensemble predictions.
+    
+    This class provides a unified interface for applying various ensembling
+    strategies to improve the accuracy of semantic operator predictions.
+    
+    Attributes:
+        config: Configuration object with ensemble parameters.
+        
+    Example:
+        >>> config = EnsembleConfig(n_samples=5, strategy=EnsembleStrategy.MAJORITY_VOTE)
+        >>> ensemble = Ensemble(config)
+        >>> samples = [True, True, True, False, True]
+        >>> result = ensemble.aggregate(samples)
+        >>> print(result)  # True
+    """
+    
+    def __init__(self, config: EnsembleConfig | None = None):
+        """
+        Initialize the ensemble with the given configuration.
+        
+        Args:
+            config: Ensemble configuration. If None, uses default settings.
+        """
+        self.config = config or EnsembleConfig()
+    
+    def aggregate(
+        self, 
+        samples: list[Any], 
+    ) -> Any:
+        """
+        Aggregate multiple samples using the configured strategy.
+        
+        Args:
+            samples: List of predictions to aggregate.
+            weights: Optional weights for weighted strategies.
+            default: Default value for consensus strategy.
+            
+        Returns:
+            The aggregated prediction result.
+            
+        Raises:
+            ValueError: If the configured strategy is not recognized.
+        """
+        strategy = self.config.strategy
+        
+        if strategy == EnsembleStrategy.MAJORITY_VOTE:
+            return majority_vote(samples)
+            
+        elif strategy == EnsembleStrategy.WEIGHTED_AVERAGE:
+            if not all(isinstance(s, bool) for s in samples):
+                # Fall back to majority vote for non-boolean types
+                return majority_vote(samples)
+            return weighted_average(samples, self.config.weights)
+            
+        elif strategy == EnsembleStrategy.CONSENSUS:
+            return consensus(samples, default=self.config.default)
+            
+        elif strategy == EnsembleStrategy.CONFIDENCE_THRESHOLD:
+            result, confidence = confidence_threshold(
+                samples, 
+                threshold=self.config.confidence_threshold
+            )
+            return result
+            
+        else:
+            raise ValueError(f"Unknown ensemble strategy: {strategy}")
+    
+    def aggregate_batch(
+        self, 
+        batch_samples: list[list[Any]], 
+    ) -> list[Any]:
+        """
+        Aggregate samples for a batch of inputs.
+        
+        Args:
+            batch_samples: List of sample lists, one per input.
+            weights: Optional weights for each sample in each input.
+            default: Default value for consensus strategy.
+            
+        Returns:
+            List of aggregated predictions, one per input.
+        """
+        results = []
+        for i, samples in enumerate(batch_samples):
+            # For now, we assume uniform weights across the batch if configured globally
+            # Realistically, per-item weights would need a different config structure
+            results.append(self.aggregate(samples))
+        return results
diff --git a/lotus/sem_ops/sem_filter.py b/lotus/sem_ops/sem_filter.py
index 3a26506d..f0ac5bff 100644
--- a/lotus/sem_ops/sem_filter.py
+++ b/lotus/sem_ops/sem_filter.py
@@ -9,15 +9,16 @@
 from lotus.templates import task_instructions
 from lotus.types import (
     CascadeArgs,
-    LMOutput,
     LogprobsForFilterCascade,
     ProxyModel,
+    RawOutputs,
     ReasoningStrategy,
     SemanticFilterOutput,
 )
 from lotus.utils import show_safe_mode
 
 from .cascade_utils import calibrate_llm_logprobs, importance_sampling, learn_cascade_thresholds
+from .ensembling import Ensemble, EnsembleConfig, EnsembleStrategy
 from .postprocessors import filter_postprocess
 
 
@@ -35,6 +36,8 @@ def sem_filter(
     show_progress_bar: bool = True,
     progress_bar_desc: str = "Filtering",
     additional_cot_instructions: str = "",
+    n_sample: int = 1,
+    ensemble: Ensemble | None = None,
 ) -> SemanticFilterOutput:
     """
     Filters a list of documents based on a natural language instruction using a language model.
@@ -73,21 +76,30 @@ def sem_filter(
             Defaults to "Filtering".
         additional_cot_instructions (str, optional): Additional instructions for
             chain-of-thought reasoning. Defaults to "".
-
+        n_sample (int, optional): Number of samples to generate for test-time scaling.
+            When > 1, multiple predictions are made and aggregated. Defaults to 1.
+        ensemble (Ensemble | None, optional): The ensemble object to use for aggregation
+            when n_sample > 1. If None and n_sample > 1, defaults to MAJORITY_VOTE.
+    
     Returns:
-        SemanticFilterOutput: An object containing the boolean filter outputs, raw
-            outputs, explanations (if applicable), and log probabilities (if requested).
+        SemanticFilterOutput: An object containing the boolean filter outputs, 
+            per-run rollout data, and stats.
 
     Raises:
-        ValueError: If the model is not properly configured or if there are
-            issues with the input parameters.
+        ValueError: If the model is not properly configured, if n_sample < 1,
+            or if there are issues with the input parameters.
 
     Example:
         >>> docs = [{"text": "Positive review"}, {"text": "Negative review"}]
         >>> model = LM(model="gpt-4o")
         >>> result = sem_filter(docs, model, "Is this a positive sentiment?")
         >>> print(result.outputs)  # [True, False]
+        >>> # With test-time scaling
+        >>> result = sem_filter(docs, model, "Is this positive?", n_sample=3, ensemble=EnsembleStrategy.MAJORITY_VOTE)
     """
+    if n_sample < 1:
+        raise ValueError("n_sample must be at least 1")
+
     inputs = []
     for doc in docs:
         prompt = lotus.templates.task_instructions.filter_formatter(
@@ -102,30 +114,61 @@ def sem_filter(
         )
         lotus.logger.debug(f"input to model: {prompt}")
         inputs.append(prompt)
+    
     kwargs: dict[str, Any] = {"logprobs": logprobs}
-
+    
     if safe_mode:
-        estimated_total_calls = len(docs)
-        estimated_total_cost = sum(model.count_tokens(input) for input in inputs)
+        estimated_total_calls = len(docs) * n_sample
+        estimated_total_cost = sum(model.count_tokens(input) for input in inputs) * n_sample
         show_safe_mode(estimated_total_cost, estimated_total_calls)
 
-    lm_output: LMOutput = model(
-        inputs, show_progress_bar=show_progress_bar, progress_bar_desc=progress_bar_desc, **kwargs
-    )
+    # Multi-sample path with ensembling
+    if ensemble is None:
+        ensemble_obj = Ensemble(EnsembleConfig(strategy=EnsembleStrategy.MAJORITY_VOTE))
+    else:
+        ensemble_obj = ensemble
+
+    # Collect all run outputs
+    all_runs_outputs: list[list[bool]] = [[] for _ in range(len(docs))]
+    all_runs_raw_outputs: list[list[str]] = [[] for _ in range(len(docs))]
+    all_runs_explanations: list[list[str | None]] = [[] for _ in range(len(docs))]
+    all_runs_logprobs: list[list[list]] = [[] for _ in range(len(docs))] if logprobs else None
+
+    # Run n_sample times
+    for sample_idx in range(n_sample):
+        desc = f"{progress_bar_desc} (sample {sample_idx + 1}/{n_sample})"
+        lm_output = model(
+            inputs, show_progress_bar=show_progress_bar, progress_bar_desc=desc, **kwargs
+        )
+
+        postprocess_output = filter_postprocess(lm_output.outputs, model, default)
+
+        # Collect outputs for each document
+        for doc_idx in range(len(docs)):
+            all_runs_outputs[doc_idx].append(postprocess_output.outputs[doc_idx])
+            all_runs_raw_outputs[doc_idx].append(postprocess_output.raw_outputs[doc_idx])
+            all_runs_explanations[doc_idx].append(postprocess_output.explanations[doc_idx])
+            if logprobs and lm_output.logprobs:
+                all_runs_logprobs[doc_idx].append(lm_output.logprobs[doc_idx])
+
+    # Aggregate using ensemble strategy
+    final_outputs = ensemble_obj.aggregate_batch(all_runs_outputs)
 
-    postprocess_output = filter_postprocess(lm_output.outputs, model, default)
-    lotus.logger.debug(f"outputs: {postprocess_output.outputs}")
-    lotus.logger.debug(f"raw_outputs: {postprocess_output.raw_outputs}")
-    lotus.logger.debug(f"explanations: {postprocess_output.explanations}")
+    raw_outputs_obj = RawOutputs(
+        predictions=all_runs_outputs,
+        raw_outputs=all_runs_raw_outputs,
+        explanations=all_runs_explanations,
+        logprobs=all_runs_logprobs,
+    )
 
+    lotus.logger.debug(f"outputs: {final_outputs}")
+    
     if safe_mode:
         model.print_total_usage()
 
     return SemanticFilterOutput(
-        raw_outputs=postprocess_output.raw_outputs,
-        outputs=postprocess_output.outputs,
-        explanations=postprocess_output.explanations,
-        logprobs=lm_output.logprobs if logprobs else None,
+        outputs=final_outputs,
+        _raw_outputs=raw_outputs_obj,
     )
 
 
@@ -347,6 +390,8 @@ def __call__(
         safe_mode: bool = False,
         progress_bar_desc: str = "Filtering",
         additional_cot_instructions: str = "",
+        n_sample: int = 1,
+        ensemble: Ensemble | None = None,
     ) -> pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]:
         if lotus.settings.lm is None:
             raise ValueError(
@@ -543,49 +588,111 @@ def __call__(
                 show_progress_bar=True,
                 progress_bar_desc=progress_bar_desc,
                 additional_cot_instructions=additional_cot_instructions,
+                n_sample=n_sample,
+                ensemble=ensemble,
             )
+        if n_sample > 1:
+            # Multi-sample logic
             outputs = output.outputs
-            raw_outputs = output.raw_outputs
-            explanations = output.explanations
-
-        if not return_all:
-            # find indices where output is True
-            ids = [i for i, x in enumerate(outputs) if x]
-            idx_ids = [self._obj.index[i] for i, x in enumerate(outputs) if x]
-            lotus.logger.debug(f"ids: {ids}")
-            lotus.logger.debug(f"idx_ids: {idx_ids}")
+            raw_outputs_obj = output._raw_outputs
+
+            if not return_all:
+                ids = [i for i, x in enumerate(outputs) if x]
+                idx_ids = [self._obj.index[i] for i, x in enumerate(outputs) if x]
+                new_df = self._obj.iloc[ids].copy()
+                new_df.attrs["index_dirs"] = self._obj.attrs.get("index_dirs", None)
+            else:
+                new_df = self._obj.copy()
+                
+                def get_out_col_name(df, col_name):
+                    if col_name in df.columns:
+                        i = 1
+                        while f"{col_name}_{i}" in new_df.columns:
+                            i += 1
+                        return f"{col_name}_{i}"
+                    else:
+                        return col_name
+                        
+                new_df[get_out_col_name(new_df, "filter_label")] = outputs
+
+            # Add columns for each sample
+            for i in range(n_sample):
+                # 1-based indexing for columns as requested
+                suffix_i = f"_{i+1}"
+                
+                # Extract data for this sample
+                sample_preds = [preds[i] for preds in raw_outputs_obj.predictions]
+                sample_raw = [raws[i] for raws in raw_outputs_obj.raw_outputs]
+                sample_expl = [expls[i] for expls in raw_outputs_obj.explanations]
+
+                # Filter if needed
+                if not return_all:
+                    sample_preds = [sample_preds[j] for j in ids]
+                    sample_raw = [sample_raw[j] for j in ids]
+                    sample_expl = [sample_expl[j] for j in ids]
+                
+                # Add columns
+                if return_raw_outputs:
+                    new_df[f"raw_output{suffix_i}"] = sample_raw
+                    new_df[f"parsed_output{suffix_i}"] = sample_preds
+                if return_explanations:
+                    new_df[f"explanation{suffix_i}"] = sample_expl
+
+            # Add ensemble answer
+            if return_explanations and return_raw_outputs:
+                 # Usually explanation for ensemble might be aggregate or empty, 
+                 # but current logic returns the chosen/final one.
+                 # The sem_filter function returns `final_outputs` as `outputs`.
+                 # For now, we don't have a separate "ensemble explanation", 
+                 # but we can omit or keep existing behavior if applicable.
+                 # The user request specifically asked for the broken down columns.
+                 pass
 
-            [outputs[i] for i in ids]
-            filtered_explanations = [explanations[i] for i in ids]
-            filtered_raw_outputs = [raw_outputs[i] for i in ids]
-            lotus.logger.debug(f"filtered_raw_outputs: {filtered_raw_outputs}")
-
-            new_df = self._obj.iloc[ids]
-            new_df.attrs["index_dirs"] = self._obj.attrs.get("index_dirs", None)
         else:
+            # Single sample logic (backward compatibility)
+            outputs = output.outputs
+            # Access raw_outputs via backward compatibility property
+            raw_outputs = output.raw_outputs
+            explanations = output.explanations
 
-            def get_out_col_name(df, col_name):
-                if col_name in df.columns:
-                    i = 1
-                    while f"{col_name}_{i}" in new_df.columns:
-                        i += 1
-                    return f"{col_name}_{i}"
-                else:
-                    return col_name
-
-            new_df = self._obj.copy()
-            new_df[get_out_col_name(new_df, "filter_label")] = outputs
-            filtered_explanations = explanations
-            filtered_raw_outputs = raw_outputs
-
-        # return rows where output is True
-        if return_explanations and return_raw_outputs:
-            new_df["explanation" + suffix] = filtered_explanations
-            new_df["raw_output" + suffix] = filtered_raw_outputs
-        elif return_explanations:
-            new_df["explanation" + suffix] = filtered_explanations
-        elif return_raw_outputs:
-            new_df["raw_output" + suffix] = filtered_raw_outputs
+            if not return_all:
+                # find indices where output is True
+                ids = [i for i, x in enumerate(outputs) if x]
+                idx_ids = [self._obj.index[i] for i, x in enumerate(outputs) if x]
+                lotus.logger.debug(f"ids: {ids}")
+                lotus.logger.debug(f"idx_ids: {idx_ids}")
+
+                [outputs[i] for i in ids]
+                filtered_explanations = [explanations[i] for i in ids]
+                filtered_raw_outputs = [raw_outputs[i] for i in ids]
+                lotus.logger.debug(f"filtered_raw_outputs: {filtered_raw_outputs}")
+
+                new_df = self._obj.iloc[ids]
+                new_df.attrs["index_dirs"] = self._obj.attrs.get("index_dirs", None)
+            else:
+
+                def get_out_col_name(df, col_name):
+                    if col_name in df.columns:
+                        i = 1
+                        while f"{col_name}_{i}" in new_df.columns:
+                            i += 1
+                        return f"{col_name}_{i}"
+                    else:
+                        return col_name
+
+                new_df = self._obj.copy()
+                new_df[get_out_col_name(new_df, "filter_label")] = outputs
+                filtered_explanations = explanations
+                filtered_raw_outputs = raw_outputs
+
+            # return rows where output is True
+            if return_explanations and return_raw_outputs:
+                new_df["explanation" + suffix] = filtered_explanations
+                new_df["raw_output" + suffix] = filtered_raw_outputs
+            elif return_explanations:
+                new_df["explanation" + suffix] = filtered_explanations
+            elif return_raw_outputs:
+                new_df["raw_output" + suffix] = filtered_raw_outputs
 
         if return_stats:
             return new_df, stats
diff --git a/lotus/templates/task_instructions.py b/lotus/templates/task_instructions.py
index 6ab05139..06e05fa7 100644
--- a/lotus/templates/task_instructions.py
+++ b/lotus/templates/task_instructions.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 import lotus
-from lotus.dtype_extensions import ImageDtype
+from lotus.dtype_extensions import AudioDtype, ImageDtype
 from lotus.types import ReasoningStrategy, SerializationFormat
 
 
@@ -39,11 +39,24 @@ def non_cot_prompt_formatter(answer_instructions: str = "") -> str:
 
 def context_formatter(
     multimodal_data: dict[str, Any] | str,
-) -> tuple[str, list[dict[str, str]]]:
+) -> tuple[str, list[dict[str, Any]], list[dict[str, Any]]]:
+    """
+    Format multimodal data into text, image inputs, and audio inputs.
+
+    Args:
+        multimodal_data: Either a string (text only) or a dictionary with
+                        'text', 'image', and optionally 'audio' keys.
+
+    Returns:
+        Tuple of (text, image_inputs, audio_inputs) where inputs are
+        formatted for LLM API consumption.
+    """
     if isinstance(multimodal_data, str):
         text = multimodal_data
-        image_inputs: list[dict[str, str]] = []
+        image_inputs: list[dict[str, Any]] = []
+        audio_inputs: list[dict[str, Any]] = []
     elif isinstance(multimodal_data, dict):
+        # Handle image data
         image_data: dict[str, str] = multimodal_data.get("image", {})
         _image_inputs: list[tuple[dict, dict]] = [
             (
@@ -59,25 +72,64 @@ def context_formatter(
             for key, base64_image in image_data.items()
         ]
         image_inputs = [m for image_input in _image_inputs for m in image_input]
+
+        # Handle audio data
+        audio_data: dict[str, str] = multimodal_data.get("audio", {})
+        _audio_inputs: list[tuple[dict, dict]] = [
+            (
+                {
+                    "type": "text",
+                    "text": f"[{key.capitalize()} Audio]: \n",
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": base64_audio.split(",")[1] if "," in base64_audio else base64_audio,
+                        "format": "wav",  # Default format, could be inferred from data URI
+                    },
+                },
+            )
+            for key, base64_audio in audio_data.items()
+            if base64_audio is not None
+        ]
+        audio_inputs = [m for audio_input in _audio_inputs for m in audio_input]
+
         text = multimodal_data["text"] or ""
     else:
         raise ValueError("multimodal_data must be a dictionary or a string")
-    return text, image_inputs
+    return text, image_inputs, audio_inputs
 
 
 def user_message_formatter(
     multimodal_data: dict[str, Any] | str,
     user_instruction_with_tag: str | None = None,
 ) -> dict[str, Any]:
-    text, image_inputs = context_formatter(multimodal_data)
-    if not image_inputs or len(image_inputs) == 0:
+    """
+    Format multimodal data into a user message for LLM APIs.
+
+    Args:
+        multimodal_data: Text string or dict with 'text', 'image', 'audio' keys.
+        user_instruction_with_tag: Optional instruction to append to the message.
+
+    Returns:
+        A dictionary representing the user message for the LLM API.
+    """
+    text, image_inputs, audio_inputs = context_formatter(multimodal_data)
+    has_media = (image_inputs and len(image_inputs) > 0) or (audio_inputs and len(audio_inputs) > 0)
+
+    if not has_media:
         return {
             "role": "user",
             "content": f"Context:\n{text}\n\n{user_instruction_with_tag}",
         }
-    content = [{"type": "text", "text": f"Context:\n{text}"}] + image_inputs
+
+    content: list[dict[str, Any]] = [{"type": "text", "text": f"Context:\n{text}"}]
+    content.extend(image_inputs)
+    content.extend(audio_inputs)
+
     if user_instruction_with_tag:
         content.append({"type": "text", "text": f"\n\n{user_instruction_with_tag}"})
+
     return {
         "role": "user",
         "content": content,
@@ -363,16 +415,29 @@ def clean_and_escape_column_name(column_name: str) -> str:
 
 def df2multimodal_info(df: pd.DataFrame, cols: list[str]) -> list[dict[str, Any]]:
     """
-    Formats the given DataFrame into a string containing info from cols.
-    Return a list of dictionaries, each containing text and image data.
+    Formats the given DataFrame into a list of multimodal info dictionaries.
+
+    Extracts text, image, and audio data from the specified columns based on
+    their dtypes. Image columns are identified by ImageDtype, audio columns
+    by AudioDtype, and all other columns are treated as text.
+
+    Args:
+        df: The DataFrame to format.
+        cols: List of column names to include in the output.
+
+    Returns:
+        A list of dictionaries, each containing 'text', 'image', and 'audio'
+        keys with the corresponding data for each row.
     """
     image_cols = [col for col in cols if isinstance(df[col].dtype, ImageDtype)]
-    text_cols = [col for col in cols if col not in image_cols]
+    audio_cols = [col for col in cols if isinstance(df[col].dtype, AudioDtype)]
+    text_cols = [col for col in cols if col not in image_cols and col not in audio_cols]
     text_rows = df2text(df, text_cols)
     multimodal_data = [
         {
             "text": text_rows[i],
             "image": {col.capitalize(): df[col].array.get_image(i, "base64") for col in image_cols},
+            "audio": {col.capitalize(): df[col].array.get_audio(i, "base64") for col in audio_cols},
         }
         for i in range(len(df))
     ]
@@ -395,7 +460,8 @@ def merge_multimodal_info(first: list[dict[str, Any]], second: list[dict[str, An
             "text": f"{first[i]['text']}\n{second[j]['text']}"
             if first[i]["text"] != "" and second[j]["text"] != ""
             else first[i]["text"] + second[j]["text"],
-            "image": {**first[i]["image"], **second[j]["image"]},
+            "image": {**first[i].get("image", {}), **second[j].get("image", {})},
+            "audio": {**first[i].get("audio", {}), **second[j].get("audio", {})},
         }
         for i in range(len(first))
         for j in range(len(second))
diff --git a/lotus/types.py b/lotus/types.py
index dc14c0f5..516ba92d 100644
--- a/lotus/types.py
+++ b/lotus/types.py
@@ -117,13 +117,44 @@ class SemanticFilterPostprocessOutput:
     explanations: list[str | None]
 
 
+@dataclass
+class RawOutputs:
+    predictions: list[list[bool]]
+    raw_outputs: list[list[str]]
+    explanations: list[list[str | None]]
+    logprobs: list[list[list[ChatCompletionTokenLogprob]]] | None = None
+
+
 @dataclass
 class SemanticFilterOutput:
-    raw_outputs: list[str]
     outputs: list[bool]
-    explanations: list[str | None]
+    _raw_outputs: RawOutputs
     stats: dict[str, Any] | None = None
-    logprobs: list[list[ChatCompletionTokenLogprob]] | None = None
+
+    @property
+    def raw_outputs(self) -> list[str]:
+        # Backward compatibility: flatten if single run, or return first run?
+        # Feedback says: "check if the lists only have 1 item and return that item"
+        if len(self._raw_outputs.raw_outputs) > 0 and len(self._raw_outputs.raw_outputs[0]) == 1:
+            return [runs[0] for runs in self._raw_outputs.raw_outputs]
+        # Fallback if multiple runs: return just the first run's data or handle differently?
+        # For now, let's return the simplified single-run view required by existing tests
+        return [runs[0] for runs in self._raw_outputs.raw_outputs]
+
+    @property
+    def explanations(self) -> list[str | None]:
+        if len(self._raw_outputs.explanations) > 0 and len(self._raw_outputs.explanations[0]) == 1:
+            return [runs[0] for runs in self._raw_outputs.explanations]
+        return [runs[0] for runs in self._raw_outputs.explanations]
+
+    @property
+    def logprobs(self) -> list[list[ChatCompletionTokenLogprob]] | None:
+        if self._raw_outputs.logprobs:
+            if len(self._raw_outputs.logprobs) > 0 and len(self._raw_outputs.logprobs[0]) == 1:
+                return [runs[0] for runs in self._raw_outputs.logprobs]
+            return [runs[0] for runs in self._raw_outputs.logprobs]
+        return None
+
 
 
 @dataclass
diff --git a/tests/test_audio_array.py b/tests/test_audio_array.py
new file mode 100644
index 00000000..318bd1c8
--- /dev/null
+++ b/tests/test_audio_array.py
@@ -0,0 +1,300 @@
+"""
+Tests for the AudioArray extension.
+
+This module contains comprehensive tests for the AudioDtype and AudioArray
+classes that enable audio data processing in LOTUS.
+"""
+
+import base64
+
+import numpy as np
+import pandas as pd
+
+from lotus.dtype_extensions.audio import (
+    SUPPORTED_AUDIO_FORMATS,
+    AudioArray,
+    AudioDtype,
+)
+
+
+class TestAudioDtype:
+    """Tests for the AudioDtype class."""
+
+    def test_dtype_name(self):
+        """Should have correct name."""
+        dtype = AudioDtype()
+        assert dtype.name == "audio"
+
+    def test_dtype_type(self):
+        """Should have bytes as scalar type."""
+        dtype = AudioDtype()
+        assert dtype.type is bytes
+
+    def test_na_value(self):
+        """Should have None as na_value."""
+        dtype = AudioDtype()
+        assert dtype.na_value is None
+
+    def test_construct_array_type(self):
+        """Should return AudioArray type."""
+        assert AudioDtype.construct_array_type() == AudioArray
+
+
+class TestAudioArrayBasics:
+    """Basic tests for AudioArray initialization and properties."""
+
+    def test_initialization_with_paths(self):
+        """Should initialize with file paths."""
+        paths = ["audio1.wav", "audio2.mp3", "audio3.flac"]
+        arr = AudioArray(paths)
+        assert len(arr) == 3
+
+    def test_initialization_with_none_values(self):
+        """Should handle None values."""
+        values = ["audio1.wav", None, "audio3.mp3"]
+        arr = AudioArray(values)
+        assert len(arr) == 3
+
+    def test_dtype_property(self):
+        """Should return AudioDtype instance."""
+        arr = AudioArray(["test.wav"])
+        assert isinstance(arr.dtype, AudioDtype)
+
+    def test_empty_array(self):
+        """Should handle empty arrays."""
+        arr = AudioArray([])
+        assert len(arr) == 0
+
+
+class TestAudioArrayIndexing:
+    """Tests for AudioArray indexing operations."""
+
+    def test_single_item_access(self):
+        """Should return single item for integer index."""
+        paths = ["a.wav", "b.mp3", "c.flac"]
+        arr = AudioArray(paths)
+        assert arr[0] == "a.wav"
+        assert arr[2] == "c.flac"
+
+    def test_slice_access(self):
+        """Should return AudioArray for slice."""
+        paths = ["a.wav", "b.mp3", "c.flac", "d.ogg"]
+        arr = AudioArray(paths)
+        result = arr[1:3]
+        assert isinstance(result, AudioArray)
+        assert len(result) == 2
+
+    def test_list_index_access(self):
+        """Should return AudioArray for list of indices."""
+        paths = ["a.wav", "b.mp3", "c.flac"]
+        arr = AudioArray(paths)
+        result = arr[[0, 2]]
+        assert isinstance(result, AudioArray)
+        assert len(result) == 2
+
+    def test_setitem_single(self):
+        """Should update single item."""
+        arr = AudioArray(["a.wav", "b.mp3"])
+        arr[0] = "new.wav"
+        assert arr[0] == "new.wav"
+
+    def test_setitem_slice(self):
+        """Should update slice of items."""
+        arr = AudioArray(["a.wav", "b.mp3", "c.flac"])
+        arr[0:2] = ["x.wav", "y.mp3"]
+        assert arr[0] == "x.wav"
+        assert arr[1] == "y.mp3"
+
+
+class TestAudioArrayMethods:
+    """Tests for AudioArray methods."""
+
+    def test_isna(self):
+        """Should detect missing values."""
+        arr = AudioArray(["a.wav", None, "c.flac"])
+        result = arr.isna()
+        assert result[0] is np.False_
+        assert result[1] is np.True_
+        assert result[2] is np.False_
+
+    def test_take(self):
+        """Should take elements by index."""
+        arr = AudioArray(["a.wav", "b.mp3", "c.flac"])
+        result = arr.take([2, 0])
+        assert len(result) == 2
+        assert result[0] == "c.flac"
+        assert result[1] == "a.wav"
+
+    def test_copy(self):
+        """Should create a copy of the array."""
+        arr = AudioArray(["a.wav", "b.mp3"])
+        copy = arr.copy()
+        assert len(copy) == len(arr)
+        copy[0] = "modified.wav"
+        assert arr[0] == "a.wav"  # Original unchanged
+
+    def test_concat_same_type(self):
+        """Should concatenate multiple AudioArrays."""
+        arr1 = AudioArray(["a.wav", "b.mp3"])
+        arr2 = AudioArray(["c.flac", "d.ogg"])
+        result = AudioArray._concat_same_type([arr1, arr2])
+        assert len(result) == 4
+
+    def test_from_sequence(self):
+        """Should construct from sequence."""
+        paths = ["a.wav", "b.mp3"]
+        result = AudioArray._from_sequence(paths)
+        assert isinstance(result, AudioArray)
+        assert len(result) == 2
+
+    def test_to_numpy(self):
+        """Should convert to numpy array."""
+        arr = AudioArray(["a.wav", "b.mp3"])
+        result = arr.to_numpy()
+        assert isinstance(result, np.ndarray)
+        assert len(result) == 2
+
+
+class TestAudioArrayEquality:
+    """Tests for AudioArray equality comparison."""
+
+    def test_equality_with_audioarray(self):
+        """Should compare with another AudioArray."""
+        arr1 = AudioArray(["a.wav", "b.mp3"])
+        arr2 = AudioArray(["a.wav", "c.flac"])
+        result = arr1 == arr2
+        assert result[0] is np.True_
+        assert result[1] is np.False_
+
+    def test_equality_with_list(self):
+        """Should compare with list."""
+        arr = AudioArray(["a.wav", "b.mp3"])
+        result = arr == ["a.wav", "x.ogg"]
+        assert result[0] is np.True_
+        assert result[1] is np.False_
+
+    def test_equality_with_scalar(self):
+        """Should compare with scalar."""
+        arr = AudioArray(["a.wav", "a.wav", "b.mp3"])
+        result = arr == "a.wav"
+        assert result[0] is np.True_
+        assert result[1] is np.True_
+        assert result[2] is np.False_
+
+
+class TestAudioArrayRepr:
+    """Tests for AudioArray string representation."""
+
+    def test_repr_short_array(self):
+        """Should show all elements for short arrays."""
+        arr = AudioArray(["a.wav", "b.mp3"])
+        repr_str = repr(arr)
+        assert "AudioArray" in repr_str
+        assert "a.wav" in repr_str
+
+    def test_repr_with_none(self):
+        """Should handle None values in repr."""
+        arr = AudioArray(["a.wav", None])
+        repr_str = repr(arr)
+        assert "None" in repr_str
+
+    def test_formatter(self):
+        """Should return formatter function."""
+        arr = AudioArray(["a.wav"])
+        formatter = arr._formatter()
+        assert callable(formatter)
+
+
+class TestAudioArrayMimeType:
+    """Tests for MIME type detection."""
+
+    def test_wav_mime_type(self):
+        """Should detect WAV MIME type."""
+        arr = AudioArray(["test.wav"])
+        assert arr.get_mime_type(0) == "audio/wav"
+
+    def test_mp3_mime_type(self):
+        """Should detect MP3 MIME type."""
+        arr = AudioArray(["test.mp3"])
+        assert arr.get_mime_type(0) == "audio/mpeg"
+
+    def test_flac_mime_type(self):
+        """Should detect FLAC MIME type."""
+        arr = AudioArray(["test.flac"])
+        assert arr.get_mime_type(0) == "audio/flac"
+
+    def test_ogg_mime_type(self):
+        """Should detect OGG MIME type."""
+        arr = AudioArray(["test.ogg"])
+        assert arr.get_mime_type(0) == "audio/ogg"
+
+    def test_none_value_mime_type(self):
+        """Should return None for None values."""
+        arr = AudioArray([None])
+        assert arr.get_mime_type(0) is None
+
+
+class TestSupportedFormats:
+    """Tests for supported audio formats."""
+
+    def test_wav_supported(self):
+        """WAV should be supported."""
+        assert ".wav" in SUPPORTED_AUDIO_FORMATS
+
+    def test_mp3_supported(self):
+        """MP3 should be supported."""
+        assert ".mp3" in SUPPORTED_AUDIO_FORMATS
+
+    def test_flac_supported(self):
+        """FLAC should be supported."""
+        assert ".flac" in SUPPORTED_AUDIO_FORMATS
+
+    def test_ogg_supported(self):
+        """OGG should be supported."""
+        assert ".ogg" in SUPPORTED_AUDIO_FORMATS
+
+    def test_mp4_supported(self):
+        """MP4 should be supported."""
+        assert ".mp4" in SUPPORTED_AUDIO_FORMATS
+
+
+class TestAudioArrayWithRealFiles:
+    """Tests with real audio files (when available)."""
+
+    def test_load_from_bytes(self):
+        """Should handle raw bytes input."""
+        audio_bytes = b"RIFF\x00\x00\x00\x00WAVEfmt "  # Minimal WAV header
+        arr = AudioArray([audio_bytes])
+        assert len(arr) == 1
+
+    def test_base64_data_uri(self):
+        """Should handle base64 data URIs."""
+        audio_bytes = b"test audio data"
+        encoded = base64.b64encode(audio_bytes).decode("utf-8")
+        data_uri = f"data:audio/wav;base64,{encoded}"
+        
+        arr = AudioArray([data_uri])
+        mime_type = arr.get_mime_type(0)
+        assert mime_type == "audio/wav"
+
+
+class TestPandasIntegration:
+    """Tests for pandas DataFrame integration."""
+
+    def test_series_with_audio_dtype(self):
+        """Should work in pandas Series."""
+        arr = AudioArray(["a.wav", "b.mp3", "c.flac"])
+        series = pd.Series(arr)
+        assert len(series) == 3
+
+    def test_dataframe_column(self):
+        """Should work as DataFrame column."""
+        arr = AudioArray(["a.wav", "b.mp3"])
+        df = pd.DataFrame({"audio": arr, "label": ["speech", "music"]})
+        assert len(df) == 2
+        assert "audio" in df.columns
+
+    def test_nbytes(self):
+        """Should calculate memory usage."""
+        arr = AudioArray(["a.wav", "b.mp3"])
+        assert arr.nbytes > 0
diff --git a/tests/test_ensembling.py b/tests/test_ensembling.py
new file mode 100644
index 00000000..44c8c359
--- /dev/null
+++ b/tests/test_ensembling.py
@@ -0,0 +1,262 @@
+"""
+Tests for the ensembling module.
+
+This module contains comprehensive tests for all ensembling strategies
+used in test-time scaling for semantic operations.
+"""
+
+import pytest
+
+from lotus.sem_ops.ensembling import (
+    Ensemble,
+    EnsembleConfig,
+    EnsembleStrategy,
+    confidence_threshold,
+    consensus,
+    majority_vote,
+    weighted_average,
+)
+
+
+class TestMajorityVote:
+    """Tests for the majority_vote function."""
+
+    def test_basic_boolean_majority(self):
+        """Should return the most common boolean value."""
+        samples = [True, True, False]
+        assert majority_vote(samples) is True
+
+    def test_all_same_value(self):
+        """Should return the unanimous value."""
+        samples = [True, True, True]
+        assert majority_vote(samples) is True
+
+    def test_string_values(self):
+        """Should work with string values."""
+        samples = ["cat", "dog", "cat", "bird", "cat"]
+        assert majority_vote(samples) == "cat"
+
+    def test_tie_returns_first_most_common(self):
+        """In case of tie, should return deterministic result."""
+        samples = [True, False]
+        result = majority_vote(samples)
+        assert result in [True, False]
+
+    def test_single_element(self):
+        """Should handle single element lists."""
+        assert majority_vote([True]) is True
+        assert majority_vote(["value"]) == "value"
+
+    def test_empty_list_raises_error(self):
+        """Should raise ValueError for empty list."""
+        with pytest.raises(ValueError, match="Cannot compute majority vote"):
+            majority_vote([])
+
+
+class TestWeightedAverage:
+    """Tests for the weighted_average function."""
+
+    def test_uniform_weights(self):
+        """With uniform weights, should behave like majority vote."""
+        samples = [True, True, False]
+        assert weighted_average(samples) is True
+
+    def test_weighted_towards_true(self):
+        """Higher weights on True should return True."""
+        samples = [True, False, False]
+        weights = [0.9, 0.1, 0.1]
+        assert weighted_average(samples, weights) is True
+
+    def test_weighted_towards_false(self):
+        """Higher weights on False should return False."""
+        samples = [True, True, False]
+        weights = [0.1, 0.1, 0.9]
+        assert weighted_average(samples, weights) is False
+
+    def test_no_weights_uses_uniform(self):
+        """If weights are None, should use uniform weights."""
+        samples = [True, True, False, False, True]
+        assert weighted_average(samples) is True
+
+    def test_mismatched_lengths_raises_error(self):
+        """Should raise ValueError if lengths don't match."""
+        with pytest.raises(ValueError, match="same length"):
+            weighted_average([True, False], [0.5])
+
+    def test_empty_list_raises_error(self):
+        """Should raise ValueError for empty list."""
+        with pytest.raises(ValueError, match="Cannot compute"):
+            weighted_average([])
+
+    def test_zero_total_weight_raises_error(self):
+        """Should raise ValueError if total weight is zero."""
+        with pytest.raises(ValueError, match="cannot be zero"):
+            weighted_average([True, False], [0.0, 0.0])
+
+
+class TestConsensus:
+    """Tests for the consensus function."""
+
+    def test_unanimous_true(self):
+        """Should return True when all samples are True."""
+        samples = [True, True, True]
+        assert consensus(samples) is True
+
+    def test_unanimous_false(self):
+        """Should return False when all samples are False."""
+        samples = [False, False, False]
+        assert consensus(samples) is False
+
+    def test_no_consensus_returns_default(self):
+        """Should return default when samples disagree."""
+        samples = [True, True, False]
+        assert consensus(samples, default=None) is None
+
+    def test_custom_default_value(self):
+        """Should return custom default when specified."""
+        samples = [True, False]
+        assert consensus(samples, default="inconclusive") == "inconclusive"
+
+    def test_string_consensus(self):
+        """Should work with string values."""
+        samples = ["yes", "yes", "yes"]
+        assert consensus(samples) == "yes"
+
+    def test_single_element_consensus(self):
+        """Single element should always have consensus."""
+        assert consensus([True]) is True
+        assert consensus(["value"]) == "value"
+
+    def test_empty_list_raises_error(self):
+        """Should raise ValueError for empty list."""
+        with pytest.raises(ValueError, match="Cannot compute consensus"):
+            consensus([])
+
+
+class TestConfidenceThreshold:
+    """Tests for the confidence_threshold function."""
+
+    def test_high_confidence(self):
+        """Should return high confidence for unanimous samples."""
+        samples = [True, True, True]
+        result, confidence = confidence_threshold(samples)
+        assert result is True
+        assert confidence == 1.0
+
+    def test_moderate_confidence(self):
+        """Should calculate correct confidence for mixed samples."""
+        samples = [True, True, False]
+        result, confidence = confidence_threshold(samples)
+        assert result is True
+        assert abs(confidence - 0.667) < 0.01
+
+    def test_low_confidence(self):
+        """Should handle low confidence scenarios."""
+        samples = [True, False]
+        result, confidence = confidence_threshold(samples)
+        assert confidence == 0.5
+
+    def test_empty_list_raises_error(self):
+        """Should raise ValueError for empty list."""
+        with pytest.raises(ValueError, match="Cannot compute confidence"):
+            confidence_threshold([])
+
+
+class TestEnsembleConfig:
+    """Tests for EnsembleConfig dataclass."""
+
+    def test_default_values(self):
+        """Should have sensible default values."""
+        config = EnsembleConfig()
+        assert config.n_samples == 3
+        assert config.strategy == EnsembleStrategy.MAJORITY_VOTE
+        assert config.temperature == 1.0
+        assert config.confidence_threshold == 0.6
+
+    def test_custom_values(self):
+        """Should accept custom values."""
+        config = EnsembleConfig(
+            n_samples=5,
+            strategy=EnsembleStrategy.CONSENSUS,
+            temperature=0.8,
+            confidence_threshold=0.75
+        )
+        assert config.n_samples == 5
+        assert config.strategy == EnsembleStrategy.CONSENSUS
+
+
+class TestEnsemble:
+    """Tests for the Ensemble class."""
+
+    def test_default_initialization(self):
+        """Should initialize with default config."""
+        ensemble = Ensemble()
+        assert ensemble.config.strategy == EnsembleStrategy.MAJORITY_VOTE
+
+    def test_custom_config(self):
+        """Should accept custom configuration."""
+        config = EnsembleConfig(strategy=EnsembleStrategy.CONSENSUS)
+        ensemble = Ensemble(config)
+        assert ensemble.config.strategy == EnsembleStrategy.CONSENSUS
+
+    def test_aggregate_majority_vote(self):
+        """Should aggregate using majority vote."""
+        ensemble = Ensemble(EnsembleConfig(strategy=EnsembleStrategy.MAJORITY_VOTE))
+        result = ensemble.aggregate([True, True, False])
+        assert result is True
+
+    def test_aggregate_weighted_average(self):
+        """Should aggregate using weighted average for booleans."""
+        ensemble = Ensemble(EnsembleConfig(strategy=EnsembleStrategy.WEIGHTED_AVERAGE))
+        result = ensemble.aggregate([True, False, False], weights=[0.9, 0.1, 0.1])
+        assert result is True
+
+    def test_aggregate_consensus(self):
+        """Should aggregate using consensus."""
+        ensemble = Ensemble(EnsembleConfig(strategy=EnsembleStrategy.CONSENSUS))
+        
+        # Unanimous case
+        result = ensemble.aggregate([True, True, True])
+        assert result is True
+        
+        # No consensus case
+        result = ensemble.aggregate([True, True, False], default=None)
+        assert result is None
+
+    def test_aggregate_confidence_threshold(self):
+        """Should aggregate using confidence threshold."""
+        config = EnsembleConfig(
+            strategy=EnsembleStrategy.CONFIDENCE_THRESHOLD,
+            confidence_threshold=0.6
+        )
+        ensemble = Ensemble(config)
+        result = ensemble.aggregate([True, True, False])
+        assert result is True
+
+    def test_aggregate_batch(self):
+        """Should aggregate multiple inputs in batch."""
+        ensemble = Ensemble()
+        batch = [
+            [True, True, False],
+            [False, False, True],
+            [True, True, True]
+        ]
+        results = ensemble.aggregate_batch(batch)
+        assert results == [True, False, True]
+
+    def test_weighted_average_fallback_for_non_boolean(self):
+        """Weighted average should fall back to majority vote for non-booleans."""
+        ensemble = Ensemble(EnsembleConfig(strategy=EnsembleStrategy.WEIGHTED_AVERAGE))
+        result = ensemble.aggregate(["cat", "cat", "dog"])
+        assert result == "cat"
+
+
+class TestEnsembleStrategy:
+    """Tests for EnsembleStrategy enum."""
+
+    def test_strategy_values(self):
+        """Should have correct string values."""
+        assert EnsembleStrategy.MAJORITY_VOTE.value == "majority_vote"
+        assert EnsembleStrategy.WEIGHTED_AVERAGE.value == "weighted_average"
+        assert EnsembleStrategy.CONSENSUS.value == "consensus"
+        assert EnsembleStrategy.CONFIDENCE_THRESHOLD.value == "confidence_threshold"
diff --git a/tests/test_sem_filter_ensembling.py b/tests/test_sem_filter_ensembling.py
new file mode 100644
index 00000000..7fe82c18
--- /dev/null
+++ b/tests/test_sem_filter_ensembling.py
@@ -0,0 +1,164 @@
+"""
+Tests for sem_filter ensembling integration.
+
+Tests the integration of the Ensemble class with sem_filter operator,
+including n_sample parameter, different strategies, and output format.
+"""
+
+
+import pytest
+
+from lotus.sem_ops.ensembling import (
+    Ensemble,
+    EnsembleConfig,
+    EnsembleStrategy,
+)
+from lotus.types import RawOutputs, SemanticFilterOutput
+
+
+class TestSemanticFilterOutputProperties:
+    """Test SemanticFilterOutput backward compatibility properties."""
+    
+    def test_raw_outputs_single_sample(self):
+        """Test raw_outputs property returns single items when n_sample=1."""
+        raw = RawOutputs(
+            predictions=[[True], [False], [True]],
+            raw_outputs=[["yes"], ["no"], ["yes"]],
+            explanations=[["reason1"], ["reason2"], ["reason3"]],
+            logprobs=None,
+        )
+        output = SemanticFilterOutput(
+            outputs=[True, False, True],
+            _raw_outputs=raw,
+        )
+        
+        assert output.raw_outputs == ["yes", "no", "yes"]
+        assert output.explanations == ["reason1", "reason2", "reason3"]
+        assert output.logprobs is None
+    
+    def test_raw_outputs_multiple_samples(self):
+        """Test raw_outputs property with multiple samples (returns first)."""
+        raw = RawOutputs(
+            predictions=[[True, True, False], [False, False, True]],
+            raw_outputs=[["yes", "yes", "no"], ["no", "no", "yes"]],
+            explanations=[["r1", "r2", "r3"], ["r4", "r5", "r6"]],
+            logprobs=None,
+        )
+        output = SemanticFilterOutput(
+            outputs=[True, False],  # Aggregated results
+            _raw_outputs=raw,
+        )
+        
+        # Should return first run's data for backward compatibility
+        assert output.raw_outputs == ["yes", "no"]
+        assert output.explanations == ["r1", "r4"]
+
+
+class TestEnsembleConfigIntegration:
+    """Test EnsembleConfig usage in real scenarios."""
+    
+    def test_default_config(self):
+        """Test default EnsembleConfig values."""
+        config = EnsembleConfig()
+        assert config.strategy == EnsembleStrategy.MAJORITY_VOTE
+        assert config.weights is None
+        assert config.default is None
+        assert config.confidence_threshold == 0.6
+    
+    def test_config_with_weights(self):
+        """Test EnsembleConfig with weighted average settings."""
+        config = EnsembleConfig(
+            strategy=EnsembleStrategy.WEIGHTED_AVERAGE,
+            weights=[0.5, 0.3, 0.2],
+        )
+        ensemble = Ensemble(config)
+        
+        # Test aggregation uses config weights
+        result = ensemble.aggregate([True, True, False])
+        assert result is True  # 0.5 + 0.3 = 0.8 > 0.5 threshold
+    
+    def test_config_with_consensus_default(self):
+        """Test EnsembleConfig with consensus and default value."""
+        config = EnsembleConfig(
+            strategy=EnsembleStrategy.CONSENSUS,
+            default=False,
+        )
+        ensemble = Ensemble(config)
+        
+        # No consensus -> should return default
+        result = ensemble.aggregate([True, False, True])
+        assert result is False
+        
+        # Consensus -> should return the agreed value
+        result = ensemble.aggregate([True, True, True])
+        assert result is True
+
+
+class TestEnsembleBatchAggregation:
+    """Test batch aggregation for sem_filter integration."""
+    
+    def test_batch_majority_vote(self):
+        """Test batch aggregation with majority vote."""
+        config = EnsembleConfig(strategy=EnsembleStrategy.MAJORITY_VOTE)
+        ensemble = Ensemble(config)
+        
+        # Simulate outputs from 3 samples for 4 documents
+        batch_samples = [
+            [True, True, False],   # Doc 0: 2/3 True -> True
+            [False, False, False], # Doc 1: 0/3 True -> False
+            [True, False, True],   # Doc 2: 2/3 True -> True
+            [False, True, False],  # Doc 3: 1/3 True -> False
+        ]
+        
+        results = ensemble.aggregate_batch(batch_samples)
+        assert results == [True, False, True, False]
+    
+    def test_batch_consensus(self):
+        """Test batch aggregation with consensus strategy."""
+        config = EnsembleConfig(
+            strategy=EnsembleStrategy.CONSENSUS,
+            default=None,
+        )
+        ensemble = Ensemble(config)
+        
+        batch_samples = [
+            [True, True, True],    # Doc 0: unanimous True
+            [False, False, True],  # Doc 1: no consensus -> None
+            [False, False, False], # Doc 2: unanimous False
+        ]
+        
+        results = ensemble.aggregate_batch(batch_samples)
+        assert results == [True, None, False]
+
+
+class TestRawOutputsStructure:
+    """Test RawOutputs dataclass structure."""
+    
+    def test_raw_outputs_creation(self):
+        """Test creating RawOutputs with all fields."""
+        raw = RawOutputs(
+            predictions=[[True, False], [False, True]],
+            raw_outputs=[["yes", "no"], ["no", "yes"]],
+            explanations=[["r1", "r2"], ["r3", "r4"]],
+            logprobs=[[[{"token": "yes", "logprob": -0.1}]], [[{"token": "no", "logprob": -0.2}]]],
+        )
+        
+        assert len(raw.predictions) == 2
+        assert len(raw.predictions[0]) == 2
+        assert raw.raw_outputs[0][0] == "yes"
+        assert raw.explanations[1][1] == "r4"
+    
+    def test_raw_outputs_without_logprobs(self):
+        """Test RawOutputs with None logprobs."""
+        raw = RawOutputs(
+            predictions=[[True]],
+            raw_outputs=[["yes"]],
+            explanations=[["reason"]],
+            logprobs=None,
+        )
+        
+        assert raw.logprobs is None
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])