lotus-data · liana313 · Apr 30, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 24, 2026
diff --git a/.github/tests/multimodality_tests.py b/.github/tests/multimodality_tests.py
@@ -70,7 +70,7 @@ def test_filter_operation(setup_models, model):
         "https://thumbs.dreamstime.com/b/comida-r%C3%A1pida-nachos-con-el-sause-del-tomate-ejemplo-exhausto-de-la-acuarela-mano-aislado-en-blanco-150936354.jpg",
         "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&amp;ssl=1",
         "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg",
-        "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg",
+        "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600",
     ]
     df = pd.DataFrame({"image": ImageArray(image_url)})
     user_instruction = "{image} represents food"
@@ -96,7 +96,7 @@ def test_join_operation(setup_models, model):
         "https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0",
         "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&amp;ssl=1",
         "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg",
-        "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg",
+        "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600",
     ]
     elements = ["doll", "bird"]
     image_df = pd.DataFrame({"image": ImageArray(image_url)})
@@ -123,14 +123,14 @@ def test_topk_operation(setup_models, model):
         "https://thumbs.dreamstime.com/b/comida-r%C3%A1pida-nachos-con-el-sause-del-tomate-ejemplo-exhausto-de-la-acuarela-mano-aislado-en-blanco-150936354.jpg",
         "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&amp;ssl=1",
         "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg",
-        "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg",
+        "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600",
     ]
     df = pd.DataFrame({"image": ImageArray(image_url)})
     user_instruction = "{image} represents living beings"
     top_2_expected = set(
         [
             "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg",
-            "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg",
+            "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600",
         ]
     )
 
@@ -148,7 +148,7 @@ def test_topk_with_groupby_operation(setup_models, model):
         "https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0",
         "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&amp;ssl=1",
         "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg",
-        "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg",
+        "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600",
     ]
     elements = ["doll", "bird"]
     image_df = pd.DataFrame({"image": ImageArray(image_url)})
@@ -169,7 +169,7 @@ def test_search_operation(setup_models, model):
         "https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0",
         "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&amp;ssl=1",
         "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg",
-        "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg",
+        "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600",
     ]
 
     expected_result = set(["https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg"])
@@ -190,7 +190,7 @@ def test_sim_join_operation_image_index(setup_models, model):
         "https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0",
         "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&amp;ssl=1",
         "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg",
-        "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg",
+        "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600",
     ]
     elements = ["doll", "bird"]
 

diff --git a/docs/DirectoryReader.rst b/docs/DirectoryReader.rst
@@ -1,10 +1,10 @@
 File Loading with DirectoryReader
-========================
+=================================
 
 Overview
 ---------
 The `DirectoryReader` class provides an enhanced, flexible way to ingest and process various document types, including local files, directories, and URLs. 
-It supports incremental file addition, automatic type detection, URL downloads, and efficient metadata handling, making it seemless to integrate files with LOTUS.
+It supports incremental file addition, automatic type detection, URL downloads, and efficient metadata handling, making it seamless to integrate files with LOTUS.
 
 Supported File Types
 --------------------
@@ -15,15 +15,15 @@ Supported File Types
 - Word files (DOCX, DOC): `per_page` mode is not supported for such files.
 - Text-based files (`.txt`, `.py`, `.md`, etc.): `per_page` mode is not supported for such files.
 
-Intstallation
---------
+Installation
+------------
 To get started, you will need to install the lotus submodule as follows::
 
     pip install lotus-ai[file_extractor]
 
 
 PDF Example
---------
+-----------
 .. code-block:: python
 
     import pathlib
@@ -46,7 +46,7 @@ PDF Example
     print(top_motivating_poems["content"].values[0])
 
 Remote PDF Example
---------
+------------------
 You can directly download PDFs from URLs and process them seamlessly:
 
 .. code-block:: python
@@ -62,7 +62,7 @@ You can directly download PDFs from URLs and process them seamlessly:
     print(f"Loaded PDFs:\n{df[['file_path', 'content']]}")
 
 PowerPoint (PPT) Example
---------
+------------------------
 The `DirectoryReader` class also supports PPT files, downloading and extracting each slide's content into a structured format:
 
 .. code-block:: python
@@ -76,7 +76,7 @@ The `DirectoryReader` class also supports PPT files, downloading and extracting
 
 Chunking
 --------
-You aslo have the option to chunk the documents. This is useful when you have a large document and you want to process it in smaller chunks.
+You also have the option to chunk the documents. This is useful when you have a large document and you want to process it in smaller chunks.
 You can specify the chunk size and the overlap between the chunks or use the default values of 1000 and 50 respectively.
 
 .. code-block:: python
@@ -90,7 +90,7 @@ You can specify the chunk size and the overlap between the chunks or use the def
 
 
 Optional Parameters for initializing DirectoryReader
---------------------------------
+----------------------------------------------------
 - **recursive (bool)**: Whether to recursively search subdirectories. Default is `False`.
 - **custom_reader_configs (dict)**: Configuration for custom file readers based on file extensions. Currently supports PPT, PPTX and PPTM
 - **exclude (List[str])**: Patterns of files to exclude.
@@ -130,12 +130,11 @@ Available Methods
 
 
 Integration with LOTUS Semantic Operators
---------------------
+-----------------------------------------
 Once you've loaded your data files, you can proceed to seamlessly use LOTUS' semantic operators!
 
 .. code-block:: python
 
     filtered_df = df.sem_filter(user_instruction="Filter instruction here", cascade_args=cascade_args)
     ranked_df = filtered_df.sem_topk("Ranking instruction here", K=3)
     print(f"Top Ranked Results:\n{ranked_df[['content']]}")
-
diff --git a/docs/approximation_cascades.rst b/docs/approximation_cascades.rst
@@ -1,5 +1,5 @@
 Optimized Processing with Approximations
-=======================
+========================================
 
 Overview
 ---------------
@@ -26,45 +26,148 @@ lotus's configuration settings
 
 .. code-block:: python
 
-   import lotus
-   from lotus.models import LM
-   from lotus.types import CascadeArgs
-
-
-   gpt_4o_mini = LM("gpt-4o-mini")
-   gpt_4o = LM("gpt-4o")
-
-   lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini)
-
-
-Once the LMs are set up, specify the cascade parameters-like recall and precision targets, sampling percentage, and 
-the acceptable failure probability-using the CascadeArgs object. 
+    import lotus
+    from lotus.models import LM
+    from lotus.types import CascadeArgs, ProxyModel
+
+    lotus.settings.configure(
+        lm=LM(model="gpt-4o"),
+        helper_lm=LM(model="gpt-4o-mini"),
+    )
+
+    cascade_args = CascadeArgs(
+        recall_target=0.9,
+        precision_target=0.9,
+        sampling_percentage=0.5,
+        failure_probability=0.2,
+        proxy_model=ProxyModel.HELPER_LM,
+    )
+
+    filtered, stats = df.sem_filter(
+        user_instruction="{Course Name} requires a lot of math",
+        cascade_args=cascade_args,
+        return_stats=True,
+    )
+
+CascadeArgs Parameters
+----------------------
+
+Accuracy Targets
+~~~~~~~~~~~~~~~~
+
+These fields describe the quality/cost tradeoff you want LOTUS to target when
+it learns thresholds.
+
+- ``recall_target``: Target recall for the cascade. Increase this when missing
+  true positives is costly. Default: ``0.8``.
+- ``precision_target``: Target precision for the cascade. Increase this when
+  false positives are costly. Default: ``0.8``.
+- ``failure_probability``: Allowed probability that the learned thresholds do
+  not meet the requested targets. Lower values are more conservative. Default:
+  ``0.2``.
+
+Sampling and Calibration
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These fields control how LOTUS samples rows or pairs while learning
+thresholds.
+
+- ``sampling_percentage``: Fraction of proxy-scored items sampled for
+  threshold learning. Default: ``0.1``.
+- ``cascade_IS_weight``: Importance-sampling weight. Higher values bias the
+  calibration sample toward high proxy scores; lower values make sampling more
+  uniform. Default: ``0.9``.
+- ``cascade_IS_max_sample_range``: Maximum prefix of proxy-ranked candidates
+  considered for importance sampling. Default: ``200``.
+- ``cascade_IS_random_seed``: Optional random seed for reproducible threshold
+  sampling. Default: ``None``.
+- ``cascade_num_calibration_quantiles``: Number of quantile buckets used to
+  calibrate helper-LM probabilities for filter cascades. Default: ``50``.
+
+Proxy Model Selection
+~~~~~~~~~~~~~~~~~~~~~
+
+``proxy_model`` chooses the cheap model used before routing uncertain cases to
+the main LM.
+
+- ``ProxyModel.HELPER_LM``: Use ``lotus.settings.helper_lm`` as the proxy.
+  This is the default for filter cascades and pairwise-judge cascades.
+- ``ProxyModel.EMBEDDING_MODEL``: Use the configured retrieval model as an
+  embedding proxy where supported.
+
+Filter Cascade Parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These parameters apply to ``sem_filter`` and pairwise-judge cascades, because
+pairwise judging is implemented through semantic filtering.
+
+- ``helper_filter_instruction``: Optional simplified instruction for the helper
+  LM. If omitted, the helper uses the main filter instruction.
+- ``filter_pos_cascade_threshold``: Optional precomputed positive threshold.
+  Proxy scores at or above this threshold are accepted without the main LM.
+- ``filter_neg_cascade_threshold``: Optional precomputed negative threshold.
+  Proxy scores at or below this threshold are rejected without the main LM.
+
+``filter_pos_cascade_threshold`` and ``filter_neg_cascade_threshold`` must be
+provided together, and the positive threshold must be greater than or equal to
+the negative threshold.
+
+Join Cascade Parameters
+~~~~~~~~~~~~~~~~~~~~~~~
+
+These parameters apply to ``sem_join`` cascades.
+
+- ``min_join_cascade_size``: Minimum full join size before LOTUS considers a
+  join cascade. Default: ``100``.
+- ``map_instruction``: Optional instruction for the map-search-filter join
+  strategy. This maps left rows into likely right-side values before search.
+- ``map_examples``: Optional few-shot examples for ``map_instruction``.
+- ``join_cascade_strategy``: Optional fixed join cascade strategy. Supported
+  values are ``"search_filter"`` and ``"map_search_filter"``. If omitted,
+  LOTUS evaluates both strategies and chooses the cheaper plan.
+- ``join_cascade_pos_threshold``: Optional precomputed positive threshold for
+  join helper scores.
+- ``join_cascade_neg_threshold``: Optional precomputed negative threshold for
+  join helper scores.
+
+If ``join_cascade_strategy`` is provided, both join thresholds must also be
+provided, and the positive threshold must be greater than or equal to the
+negative threshold.
+
+Precomputed Thresholds
+~~~~~~~~~~~~~~~~~~~~~~
+
+Thresholds are usually learned automatically. You can provide them manually
+when you have already calibrated a cascade and want to skip threshold learning.
 
 .. code-block:: python
 
-   cascade_args = CascadeArgs(recall_target=0.9, precision_target=0.9, sampling_percentage=0.5, failure_probability=0.2)
+    cascade_args = CascadeArgs(
+        filter_pos_cascade_threshold=0.62,
+        filter_neg_cascade_threshold=0.52,
+    )
 
-After preparing the arguments, call the semantic operator method on the DataFrame
+For LazyFrame pipelines, :class:`lotus.ast.optimizer.CascadeOptimizer` can
+learn thresholds on training data and store them in the optimized pipeline.
 
-.. code-block:: python
-
-   df, stats = df.sem_filter(user_instruction=user_instruction, cascade_args=cascade_args, return_stats=True)
+Interpreting Filter Statistics
+------------------------------
 
-Note that these parameters guide the trade-off between speed and accuracy when applying the cascade operators
+For cascade operators, ``return_stats=True`` returns metrics that explain how
+much work was handled by the proxy and how much was routed to the main LM.
 
-Interpreting Output Statistics
--------------------------------
-For cascade operators, Output statistics will contain key performance metrics.
-
-An Example output statistic: 
+Example filter stats:
 
 .. code-block:: text
 
-   {'pos_cascade_threshold': 0.62, 
-   'neg_cascade_threshold': 0.52, 
-   'filters_resolved_by_helper_model': 95, 
-   'filters_resolved_by_large_model': 8, 
-   'num_routed_to_helper_model': 95}
+    {
+        "pos_cascade_threshold": 0.62,
+        "neg_cascade_threshold": 0.52,
+        "filters_resolved_by_helper_model": 95,
+        "filters_resolved_by_large_model": 8,
+        "num_routed_to_helper_model": 95,
+        "cascade_args": CascadeArgs(...),
+    }
 
 Here is a detailed explanation of each metric
 
@@ -86,4 +189,8 @@ Here is a detailed explanation of each metric
 
 5. **num_routed_to_helper_model**  
    The total number of items initially processed by the helper model.  
-   Since 95 items were routed, and only 8 required the oracle, this shows a favorable balance between cost and accuracy.
+   Since 95 items were routed, and only 8 required the oracle, this shows a favorable balance between cost and accuracy.
+
+6. **cascade_args**
+   Copy of the cascade configuration, including learned
+   thresholds.