From e74f8dd921b99d0c2c758d9ea4f607969466cafe Mon Sep 17 00:00:00 2001 From: Harshit Gupta Date: Wed, 15 Apr 2026 13:52:19 -0700 Subject: [PATCH 1/3] fix docs --- docs/DirectoryReader.rst | 21 +- docs/approximation_cascades.rst | 169 +++++++++++--- docs/ast.rst | 312 -------------------------- docs/configurations.rst | 33 ++- docs/data_connectors.rst | 23 +- docs/evals.rst | 368 ------------------------------- docs/evaluation.rst | 80 +++++++ docs/evaluation_advanced.rst | 113 ++++++++++ docs/index.rst | 28 ++- docs/installation.rst | 12 +- docs/lazyframe.rst | 242 ++++++++++++++++++++ docs/lazyframe_api.rst | 18 ++ docs/lazyframe_optimizations.rst | 236 ++++++++++++++++++++ docs/llm.rst | 2 +- docs/llm_as_judge.rst | 154 +++++++++++++ docs/pairwise_judge.rst | 149 +++++++++++++ docs/prompt_strategies.rst | 19 +- docs/sem_agg.rst | 222 ++++++++++--------- docs/sem_filter.rst | 298 +++++++++++-------------- docs/sem_join.rst | 239 ++++++++++---------- docs/sem_map.rst | 164 +++++++++----- docs/usage.rst | 14 +- docs/vector_store.rst | 4 +- docs/web_extract.rst | 97 ++++++++ docs/web_search.rst | 314 ++++++++------------------ lotus/ast/lazyframe.py | 6 +- 26 files changed, 1895 insertions(+), 1442 deletions(-) delete mode 100644 docs/ast.rst delete mode 100644 docs/evals.rst create mode 100644 docs/evaluation.rst create mode 100644 docs/evaluation_advanced.rst create mode 100644 docs/lazyframe.rst create mode 100644 docs/lazyframe_api.rst create mode 100644 docs/lazyframe_optimizations.rst create mode 100644 docs/llm_as_judge.rst create mode 100644 docs/pairwise_judge.rst create mode 100644 docs/web_extract.rst diff --git a/docs/DirectoryReader.rst b/docs/DirectoryReader.rst index 72b7bb83..2fdc3dd1 100644 --- a/docs/DirectoryReader.rst +++ b/docs/DirectoryReader.rst @@ -1,10 +1,10 @@ File Loading with DirectoryReader -======================== +================================= Overview --------- The `DirectoryReader` class provides an enhanced, flexible way to 
ingest and process various document types, including local files, directories, and URLs. -It supports incremental file addition, automatic type detection, URL downloads, and efficient metadata handling, making it seemless to integrate files with LOTUS. +It supports incremental file addition, automatic type detection, URL downloads, and efficient metadata handling, making it seamless to integrate files with LOTUS. Supported File Types -------------------- @@ -15,15 +15,15 @@ Supported File Types - Word files (DOCX, DOC): `per_page` mode is not supported for such files. - Text-based files (`.txt`, `.py`, `.md`, etc.): `per_page` mode is not supported for such files. -Intstallation --------- +Installation +------------ To get started, you will need to install the lotus submodule as follows:: pip install lotus-ai[file_extractor] PDF Example --------- +----------- .. code-block:: python import pathlib @@ -46,7 +46,7 @@ PDF Example print(top_motivating_poems["content"].values[0]) Remote PDF Example --------- +------------------ You can directly download PDFs from URLs and process them seamlessly: .. code-block:: python @@ -62,7 +62,7 @@ You can directly download PDFs from URLs and process them seamlessly: print(f"Loaded PDFs:\n{df[['file_path', 'content']]}") PowerPoint (PPT) Example --------- +------------------------ The `DirectoryReader` class also supports PPT files, downloading and extracting each slide's content into a structured format: .. code-block:: python @@ -76,7 +76,7 @@ The `DirectoryReader` class also supports PPT files, downloading and extracting Chunking -------- -You aslo have the option to chunk the documents. This is useful when you have a large document and you want to process it in smaller chunks. +You also have the option to chunk the documents. This is useful when you have a large document and you want to process it in smaller chunks. 
You can specify the chunk size and the overlap between the chunks or use the default values of 1000 and 50 respectively. .. code-block:: python @@ -90,7 +90,7 @@ You can specify the chunk size and the overlap between the chunks or use the def Optional Parameters for initializing DirectoryReader --------------------------------- +---------------------------------------------------- - **recursive (bool)**: Whether to recursively search subdirectories. Default is `False`. - **custom_reader_configs (dict)**: Configuration for custom file readers based on file extensions. Currently supports PPT, PPTX and PPTM - **exclude (List[str])**: Patterns of files to exclude. @@ -130,7 +130,7 @@ Available Methods Integration with LOTUS Semantic Operators --------------------- +----------------------------------------- Once you've loaded your data files, you can proceed to seamlessly use LOTUS' semantic operators! .. code-block:: python @@ -138,4 +138,3 @@ Once you've loaded your data files, you can proceed to seamlessly use LOTUS' sem filtered_df = df.sem_filter(user_instruction="Filter instruction here", cascade_args=cascade_args) ranked_df = filtered_df.sem_topk("Ranking instruction here", K=3) print(f"Top Ranked Results:\n{ranked_df[['content']]}") - diff --git a/docs/approximation_cascades.rst b/docs/approximation_cascades.rst index 1b00e871..bce87feb 100644 --- a/docs/approximation_cascades.rst +++ b/docs/approximation_cascades.rst @@ -1,5 +1,5 @@ Optimized Processing with Approximations -======================= +======================================== Overview --------------- @@ -26,45 +26,148 @@ lotus's configuration settings .. 
code-block:: python - import lotus - from lotus.models import LM - from lotus.types import CascadeArgs - - - gpt_4o_mini = LM("gpt-4o-mini") - gpt_4o = LM("gpt-4o") - - lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini) - - -Once the LMs are set up, specify the cascade parameters-like recall and precision targets, sampling percentage, and -the acceptable failure probability-using the CascadeArgs object. + import lotus + from lotus.models import LM + from lotus.types import CascadeArgs, ProxyModel + + lotus.settings.configure( + lm=LM(model="gpt-4o"), + helper_lm=LM(model="gpt-4o-mini"), + ) + + cascade_args = CascadeArgs( + recall_target=0.9, + precision_target=0.9, + sampling_percentage=0.5, + failure_probability=0.2, + proxy_model=ProxyModel.HELPER_LM, + ) + + filtered, stats = df.sem_filter( + user_instruction="{Course Name} requires a lot of math", + cascade_args=cascade_args, + return_stats=True, + ) + +CascadeArgs Parameters +---------------------- + +Accuracy Targets +~~~~~~~~~~~~~~~~ + +These fields describe the quality/cost tradeoff you want LOTUS to target when +it learns thresholds. + +- ``recall_target``: Target recall for the cascade. Increase this when missing + true positives is costly. Default: ``0.8``. +- ``precision_target``: Target precision for the cascade. Increase this when + false positives are costly. Default: ``0.8``. +- ``failure_probability``: Allowed probability that the learned thresholds do + not meet the requested targets. Lower values are more conservative. Default: + ``0.2``. + +Sampling and Calibration +~~~~~~~~~~~~~~~~~~~~~~~~ + +These fields control how LOTUS samples rows or pairs while learning +thresholds. + +- ``sampling_percentage``: Fraction of proxy-scored items sampled for + threshold learning. Default: ``0.1``. +- ``cascade_IS_weight``: Importance-sampling weight. Higher values bias the + calibration sample toward high proxy scores; lower values make sampling more + uniform. Default: ``0.9``. 
+- ``cascade_IS_max_sample_range``: Maximum prefix of proxy-ranked candidates + considered for importance sampling. Default: ``200``. +- ``cascade_IS_random_seed``: Optional random seed for reproducible threshold + sampling. Default: ``None``. +- ``cascade_num_calibration_quantiles``: Number of quantile buckets used to + calibrate helper-LM probabilities for filter cascades. Default: ``50``. + +Proxy Model Selection +~~~~~~~~~~~~~~~~~~~~~ + +``proxy_model`` chooses the cheap model used before routing uncertain cases to +the main LM. + +- ``ProxyModel.HELPER_LM``: Use ``lotus.settings.helper_lm`` as the proxy. + This is the default for filter cascades and pairwise-judge cascades. +- ``ProxyModel.EMBEDDING_MODEL``: Use the configured retrieval model as an + embedding proxy where supported. + +Filter Cascade Parameters +~~~~~~~~~~~~~~~~~~~~~~~~~ + +These parameters apply to ``sem_filter`` and pairwise-judge cascades, because +pairwise judging is implemented through semantic filtering. + +- ``helper_filter_instruction``: Optional simplified instruction for the helper + LM. If omitted, the helper uses the main filter instruction. +- ``filter_pos_cascade_threshold``: Optional precomputed positive threshold. + Proxy scores at or above this threshold are accepted without the main LM. +- ``filter_neg_cascade_threshold``: Optional precomputed negative threshold. + Proxy scores at or below this threshold are rejected without the main LM. + +``filter_pos_cascade_threshold`` and ``filter_neg_cascade_threshold`` must be +provided together, and the positive threshold must be greater than or equal to +the negative threshold. + +Join Cascade Parameters +~~~~~~~~~~~~~~~~~~~~~~~ + +These parameters apply to ``sem_join`` cascades. + +- ``min_join_cascade_size``: Minimum full join size before LOTUS considers a + join cascade. Default: ``100``. +- ``map_instruction``: Optional instruction for the map-search-filter join + strategy. 
This maps left rows into likely right-side values before search. +- ``map_examples``: Optional few-shot examples for ``map_instruction``. +- ``join_cascade_strategy``: Optional fixed join cascade strategy. Supported + values are ``"search_filter"`` and ``"map_search_filter"``. If omitted, + LOTUS evaluates both strategies and chooses the cheaper plan. +- ``join_cascade_pos_threshold``: Optional precomputed positive threshold for + join helper scores. +- ``join_cascade_neg_threshold``: Optional precomputed negative threshold for + join helper scores. + +If ``join_cascade_strategy`` is provided, both join thresholds must also be +provided, and the positive threshold must be greater than or equal to the +negative threshold. + +Precomputed Thresholds +~~~~~~~~~~~~~~~~~~~~~~ + +Thresholds are usually learned automatically. You can provide them manually +when you have already calibrated a cascade and want to skip threshold learning. .. code-block:: python - cascade_args = CascadeArgs(recall_target=0.9, precision_target=0.9, sampling_percentage=0.5, failure_probability=0.2) + cascade_args = CascadeArgs( + filter_pos_cascade_threshold=0.62, + filter_neg_cascade_threshold=0.52, + ) -After preparing the arguments, call the semantic operator method on the DataFrame +For LazyFrame pipelines, :class:`lotus.ast.optimizer.CascadeOptimizer` can +learn thresholds on training data and store them in the optimized pipeline. -.. code-block:: python - - df, stats = df.sem_filter(user_instruction=user_instruction, cascade_args=cascade_args, return_stats=True) +Interpreting Filter Statistics +------------------------------ -Note that these parameters guide the trade-off between speed and accuracy when applying the cascade operators +For cascade operators, ``return_stats=True`` returns metrics that explain how +much work was handled by the proxy and how much was routed to the main LM. 
-Interpreting Output Statistics -------------------------------- -For cascade operators, Output statistics will contain key performance metrics. - -An Example output statistic: +Example filter stats: .. code-block:: text - {'pos_cascade_threshold': 0.62, - 'neg_cascade_threshold': 0.52, - 'filters_resolved_by_helper_model': 95, - 'filters_resolved_by_large_model': 8, - 'num_routed_to_helper_model': 95} + { + "pos_cascade_threshold": 0.62, + "neg_cascade_threshold": 0.52, + "filters_resolved_by_helper_model": 95, + "filters_resolved_by_large_model": 8, + "num_routed_to_helper_model": 95, + "cascade_args": CascadeArgs(...), + } Here is a detailed explanation of each metric @@ -86,4 +189,8 @@ Here is a detailed explanation of each metric 5. **num_routed_to_helper_model** The total number of items initially processed by the helper model. - Since 95 items were routed, and only 8 required the oracle, this shows a favorable balance between cost and accuracy. \ No newline at end of file + Since 95 items were routed, and only 8 required the oracle, this shows a favorable balance between cost and accuracy. + +6. **cascade_args** + Copy of the cascade configuration, including learned + thresholds. diff --git a/docs/ast.rst b/docs/ast.rst deleted file mode 100644 index ba0b3514..00000000 --- a/docs/ast.rst +++ /dev/null @@ -1,312 +0,0 @@ -LazyFrame — Lazy Evaluation for Semantic Pipelines -=================================================== - -The ``LazyFrame`` API lets you compose semantic operator pipelines that are -only executed when you call ``.execute()``. This separation of *definition* -from *execution* enables automatic optimizations, content-addressable caching, -and clean reuse of pipeline logic across datasets. - -.. 
code-block:: python - - from lotus.ast import LazyFrame - - lf = ( - LazyFrame() - .sem_filter("{text} expresses positive sentiment") - .sem_map("Extract the main topic from {text}") - .sem_topk("Most relevant topics", K=5) - ) - - result = lf.execute(df) # nothing runs until here - - -Why LazyFrame? --------------- - -- **Performance** — Optimizers reorder and tune the pipeline before any LLM - call is made. Predicate pushdown moves cheap pandas filters before - expensive semantic operators; GEPA tunes your natural language instructions - automatically. -- **Caching** — Intermediate results are cached by content so shared - sub-pipelines execute only once, even across joins and nested references. -- **Reusability** — Define a pipeline once, execute it on different datasets. - Persist pipelines to disk with ``save()`` / ``load()``. -- **Inspectability** — ``print_tree()`` shows the full logical plan before - execution. - - -Quick Start ------------ - -1. Configure your model -~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - import lotus - from lotus.models import LM - - lotus.settings.configure(lm=LM(model="gpt-4o-mini")) - -2. Create a LazyFrame -~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - import pandas as pd - from lotus.ast import LazyFrame - - # Option A: provide data at execution time - lf = LazyFrame() - result = lf.sem_filter("{text} is relevant").execute(df) - - # Option B: bind data directly (no input needed at execution) - lf = LazyFrame(df=df) - result = lf.sem_filter("{text} is relevant").execute({}) - - # Option C: validate input schema at execution time - lf = LazyFrame(schema={"text": "object", "score": "float64"}) - -3. Chain operators -~~~~~~~~~~~~~~~~~~ - -Semantic and pandas operations can be freely mixed: - -.. 
code-block:: python - - lf = ( - LazyFrame() - .filter(lambda df: df["score"] > 0.5) # pandas filter - .sem_filter("{text} is about machine learning") # semantic filter - .sem_map("Summarize {text} in one sentence") - .assign(upper=lambda df: df["text"].str.upper()) - .head(10) - ) - -4. Execute -~~~~~~~~~~ - -.. code-block:: python - - result = lf.execute(input_df) - -5. Inspect -~~~~~~~~~~ - -.. code-block:: python - - >>> lf.print_tree() - head(10) - -- assign(upper=...) - -- sem_map('Summarize {text} in one sentence') - -- sem_filter('{text} is about machine learning') - -- filter(...) - -- Source(bound=False) - - -Semantic Operators ------------------- - -Every LOTUS semantic operator is available on LazyFrame with the same -parameters as the DataFrame API: - -+------------------------------+--------------------------------------------+ -| Method | Description | -+==============================+============================================+ -| ``sem_filter(instruction)`` | Keep rows matching a language predicate | -+------------------------------+--------------------------------------------+ -| ``sem_map(instruction)`` | Transform each row via language instruction| -+------------------------------+--------------------------------------------+ -| ``sem_extract(in, out)`` | Extract structured attributes into columns | -+------------------------------+--------------------------------------------+ -| ``sem_agg(instruction)`` | Aggregate/summarize across rows | -+------------------------------+--------------------------------------------+ -| ``sem_topk(instruction, K)`` | Rank rows and return top *K* | -+------------------------------+--------------------------------------------+ -| ``sem_join(right, instr)`` | Join on a language predicate | -+------------------------------+--------------------------------------------+ -| ``sem_sim_join(right, ...)`` | Similarity-based join | -+------------------------------+--------------------------------------------+ -| 
``sem_search(col, query)`` | Semantic similarity search | -+------------------------------+--------------------------------------------+ -| ``sem_index(col, dir)`` | Build a semantic index | -+------------------------------+--------------------------------------------+ -| ``sem_cluster_by(col, n)`` | Cluster rows semantically | -+------------------------------+--------------------------------------------+ -| ``sem_dedup(col, threshold)``| Deduplicate by semantic similarity | -+------------------------------+--------------------------------------------+ -| ``llm_as_judge(instruction)`` | Judge responses using an LLM | -+------------------------------+--------------------------------------------+ -| ``pairwise_judge(col1, col2, instruction)`` | Compare between two columns | -+-------------------------------------------------------------------------+ - - -Pandas Operations ------------------ - -LazyFrames support standard pandas operations: - -.. code-block:: python - - lf.filter(lambda df: df["score"] > 0.5) # boolean filter - lf.assign(new_col=lambda df: df["a"] + 1) # add columns - lf["col_name"] # select column - lf[["col_a", "col_b"]] # select columns - lf.head(5) # any pandas method - lf.sort_values("score", ascending=False) - - -Multi-Source Pipelines (Joins) ------------------------------- - -For semantic joins, create separate LazyFrame sources and provide data for -each at execution time: - -.. code-block:: python - - courses_lf = LazyFrame() - skills_lf = LazyFrame() - - pipeline = courses_lf.sem_join( - skills_lf, - "Taking {Course Name} will help learn {Skill}", - how="inner", - ) - - result = pipeline.execute({ - courses_lf: courses_df, - skills_lf: skills_df, - }) - - -Combining LazyFrames --------------------- - -``concat`` and ``from_fn`` -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
code-block:: python - - lf1 = LazyFrame() - lf2 = LazyFrame() - - # Concatenate - combined = LazyFrame.concat([lf1, lf2], ignore_index=True) - result = combined.execute({lf1: df_a, lf2: df_b}) - - # Arbitrary function - def merge_and_clean(left, right): - return pd.concat([left, right]).drop_duplicates(subset=["id"]) - - lf = LazyFrame.from_fn(merge_and_clean, lf1, lf2) - result = lf.execute({lf1: df1, lf2: df2}) - - -Optimizations -------------- - -Call ``optimize()`` to apply one or more optimizers before execution: - -.. code-block:: python - - optimized = lf.optimize([optimizer1, optimizer2]) - result = optimized.execute(df) - -Available optimizers: - -+------------------------------------+-----------------------------------------------------------+---------------+ -| Optimizer | Description | Applied by | -+====================================+===========================================================+===============+ -| ``PredicatePushdownOptimizer`` | Moves pandas filters before semantic operators to reduce | Automatically | -| | the number of rows processed by expensive LLM calls. | | -+------------------------------------+-----------------------------------------------------------+---------------+ -| ``GEPAOptimizer`` | LLM-guided evolutionary search that tunes natural | Manually | -| | language instructions for better task performance. | | -+------------------------------------+-----------------------------------------------------------+---------------+ -| ``CascadeOptimizer`` | Saves learned cascade thresholds so subsequent executions | Manually | -| | skip the threshold-learning phase. | | -+------------------------------------+-----------------------------------------------------------+---------------+ - -``PredicatePushdownOptimizer`` is included in ``DEFAULT_OPTIMIZERS`` and runs -automatically when ``optimize()`` is called. Pass -``auto_include_default_optimizer=False`` to skip it. 
- -GEPA — Automatic Prompt Optimization -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -GEPA uses LLM-guided evolutionary search to tune the natural language -instructions in your pipeline. Provide a scoring function and training data: - -.. code-block:: python - - from lotus.ast.optimizer import GEPAOptimizer - - def eval_fn(output_df, example): - """Score: fraction of positive reviews correctly kept.""" - kept = set(output_df.index) - tp = len(POSITIVE_INDICES & kept) - precision = tp / max(len(kept), 1) - recall = tp / max(len(POSITIVE_INDICES), 1) - f1 = 2 * precision * recall / max(precision + recall, 1e-9) - return f1, {"precision": precision, "recall": recall} - - optimizer = GEPAOptimizer(eval_fn=eval_fn) - lf = LazyFrame(df=df).sem_filter("{review} is a positive review") - optimized = lf.optimize([optimizer], train_data=df) - -The ``eval_fn`` receives ``(output_df, example)`` and returns a float score -(higher is better), optionally with a side-info dict for GEPA's reflection. - -Control which parameters are optimized per node: - -.. code-block:: python - - lf = ( - LazyFrame(df=df) - .sem_filter("{text} is relevant", mark_optimizable=["user_instruction"]) - .sem_map("Clean {text}", mark_optimizable=[]) # excluded - ) - -Saving Optimization State -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Use ``CascadeOptimizer`` to learn cascade thresholds on training data and -save the optimization state. Subsequent executions reuse the saved -thresholds, skipping the threshold-learning phase: - -.. code-block:: python - - from lotus.ast.optimizer import CascadeOptimizer - - optimized = lf.optimize([CascadeOptimizer()], train_data=df) - optimized.save("optimized_pipeline.pkl") # state is persisted - - - -Persistence ------------ - -Save and load pipelines for reuse: - -.. code-block:: python - - lf.save("pipeline.pkl") - - loaded = LazyFrame.load("pipeline.pkl") - result = loaded.execute(new_df) - -Pipelines with custom callables (lambdas, closures) are not portable across -environments. 
- -API Reference -------------- - -.. automodule:: lotus.ast.lazyframe - :members: - :undoc-members: - :show-inheritance: - -.. automodule:: lotus.ast.optimizer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/configurations.rst b/docs/configurations.rst index b543c280..65e0787f 100644 --- a/docs/configurations.rst +++ b/docs/configurations.rst @@ -3,14 +3,14 @@ Setting Configurations Overview --------- -The Settings module let's you manage application-wide configurations. -In most examples seen, we have used the settings to configured our LM. +The Settings module lets you manage application-wide configurations. +Most examples use settings to configure the active LM. Using the Settings module -------------------------- .. code-block:: python - from lotus + import lotus from lotus.models import LM lm = LM(model="gpt-4o-mini") @@ -19,14 +19,11 @@ Using the Settings module Configurable Parameters -------------------------- -1. enable_cache: - * Description: Enables or Disables caching mechanisms - * Default: False - * Parameters: - - cache_type: Type of caching (SQLITE or In_MEMORY) - - max_size: maximum size of cache - - cache_dir: Directory for where DB file is stored. Default: "~/.lotus/cache" - * Note: It is recommended to enable caching +``enable_cache`` + Enables or disables caching mechanisms. Default: ``False``. + Cache parameters include ``cache_type``, ``max_size``, and ``cache_dir``. + It is recommended to enable caching. + .. code-block:: python import pandas as pd @@ -41,17 +38,18 @@ Configurable Parameters lm = LM(model='gpt-4o-mini', cache=cache) lotus.settings.configure(lm=lm, enable_cache=True) -2. setting RM: - * Description: Configures the retrieval model - * Default: None +``rm`` + Configures the retrieval model. Default: ``None``. + .. code-block:: python rm = SentenceTransformersRM(model="intfloat/e5-base-v2") lotus.settings.configure(rm=rm) -3. 
setting helper_lm: - * Descriptions: Configures secondary helper LM often set along with primary LM - * Default: None +``helper_lm`` + Configures a secondary helper LM, often used with cascades. Default: + ``None``. + .. code-block:: python gpt_4o_mini = LM("gpt-4o-mini") @@ -162,4 +160,3 @@ are invisible to others. ``configure()`` mutates the global settings object and is **not** thread-safe. Use ``context()`` whenever settings need to differ across concurrent execution paths. - diff --git a/docs/data_connectors.rst b/docs/data_connectors.rst index acff15f3..a41d1a10 100644 --- a/docs/data_connectors.rst +++ b/docs/data_connectors.rst @@ -1,23 +1,23 @@ Database Connectors -================= +=================== Overview --------- LOTUS' data connectors let you seamlessly load data from external stores (e.g. a SQL database) so that you can run LOTUS programs over them. -Current data connections include SQL databases supported by `SQLAlchemy`_ and any S3 serivice. +Current data connections include SQL databases supported by `SQLAlchemy`_ and any S3 service. .. _SQLAlchemy: https://docs.sqlalchemy.org/en/14/dialects/ -Intstallation --------- +Installation +------------ To get started, you will need to install the lotus submodule as follows:: pip install lotus[data_connectors] Example: Loading from SQLite ------------ +---------------------------- .. code-block:: python import lotus @@ -35,7 +35,7 @@ Example: Loading from SQLite print(df) Example: Loading from Postgres ------------- +------------------------------ .. code-block:: python import lotus @@ -53,7 +53,7 @@ Example: Loading from Postgres print(df) Example: Loading from Snowflake ---------------- +------------------------------- .. code-block:: python import lotus @@ -71,7 +71,7 @@ Example: Loading from Snowflake print(df) Example: Loading from Google Big Query --------------------------- +-------------------------------------- .. 
code-block:: python import lotus @@ -89,7 +89,7 @@ Example: Loading from Google Big Query print(df) Example: Loading from S3 ------------ +------------------------ .. code-block:: python import lotus @@ -136,8 +136,8 @@ Required DB Parameters - **connection_url** : The connection url for the database - **query** : The query to execute -Required s3 Paramaters ------------------------ +Required s3 Parameters +---------------------- - **aws_access_key** : The AWS access key (None for Public Access) - **aws_secret_key** : The AWS secret key (None for Public Access) - **region** : The AWS region @@ -145,4 +145,3 @@ Required s3 Paramaters - **file_path** : The path to the file in S3 - **endpoint_url** : The Minio endpoint URL. Default is None for AWS s3 - **protocol** : The protocol to use (http for Minio and https for R2). Default is "s3" - diff --git a/docs/evals.rst b/docs/evals.rst deleted file mode 100644 index 2a2015ef..00000000 --- a/docs/evals.rst +++ /dev/null @@ -1,368 +0,0 @@ -LLM-based Evaluation Suite -=================== - -Overview --------- -LOTUS provides a comprehensive evaluation framework instantiating LLM-as-a-Judge methods. The evaluation module supports both single response evaluation and pairwise comparisons, making it ideal for model evaluation, response quality assessment, and A/B testing scenarios. 
- -The evaluation framework includes two main components: - -- **LLM-as-Judge**: Evaluate individual responses using customizable criteria -- **Pairwise Judge**: Compare two responses side-by-side to determine which is better - -Key Features ------------- - -- **Flexible Evaluation Criteria**: Define custom judging instructions in natural language -- **Structured Output Support**: Use Pydantic models for consistent, structured evaluation results -- **Position Bias Mitigation**: Built-in column permutation to reduce ordering effects in pairwise comparisons -- **Multiple Trial Support**: Run multiple evaluation trials for improved reliability -- **Chain-of-Thought Reasoning**: Optional reasoning strategies for more explainable evaluations -- **Integration with LOTUS**: Seamless integration with other LOTUS semantic operators - -LLM-as-Judge -============ - -The LLM-as-Judge functionality allows you to evaluate individual responses using natural language instructions. - -Basic Usage ------------ - -.. code-block:: python - - import pandas as pd - import lotus - from lotus.models import LM - - # Configure the language model - lm = LM(model="gpt-4o-mini") - lotus.settings.configure(lm=lm) - - # Sample data representing responses to evaluate - data = { - "student_id": [1, 2, 3, 4], - "question": [ - "Explain the difference between supervised and unsupervised learning", - "What is the purpose of cross-validation in machine learning?", - "Describe how gradient descent works", - "What are the advantages of ensemble methods?" - ], - "answer": [ - "Supervised learning uses labeled data to train models, while unsupervised learning finds patterns in unlabeled data. 
For example, classification is supervised, clustering is unsupervised.", - "Gradient descent is an optimization algorithm that minimizes cost functions by iteratively moving in the direction of steepest descent of the gradient.", - "Cross-validation helps assess model performance by splitting data into training and validation sets multiple times to get a better estimate of how the model generalizes.", - "Ensemble methods combine multiple models to improve performance. They reduce overfitting and variance, often leading to better generalization than individual models." - ] - } - - df = pd.DataFrame(data) - - # Define evaluation criteria - judge_instruction = "Rate the accuracy and completeness of this {answer} to the {question} on a scale of 1-10, where 10 is excellent. Only output the score." - - # Run evaluation - results = df.llm_as_judge( - judge_instruction=judge_instruction, - n_trials=2, # Run multiple trials for reliability - ) - - print(results) - -Structured Output with Response Formats ---------------------------------------- - -For more detailed and consistent evaluations, use Pydantic models to define structured output formats: - -.. 
code-block:: python - - from pydantic import BaseModel, Field - - class EvaluationScore(BaseModel): - score: int = Field(description="Score from 1-10") - reasoning: str = Field(description="Detailed reasoning for the score") - strengths: list[str] = Field(description="Key strengths of the answer") - improvements: list[str] = Field(description="Areas for improvement") - - # Use structured output format - results = df.llm_as_judge( - judge_instruction="Evaluate the student {answer} for the {question}", - response_format=EvaluationScore, - suffix="_evaluation", - ) - - # Access structured fields - for idx, row in results.iterrows(): - evaluation = row['_evaluation_0'] - print(f"Score: {evaluation.score}") - print(f"Reasoning: {evaluation.reasoning}") - print(f"Strengths: {evaluation.strengths}") - print(f"Improvements: {evaluation.improvements}") - -Pairwise Judge -============== - -The Pairwise Judge functionality enables side-by-side comparison of two responses to determine which is better according to specified criteria. - -Basic Pairwise Comparison -------------------------- - -.. 
code-block:: python - - import pandas as pd - import lotus - from lotus.models import LM - - # Configure the language model - lm = LM(model="gpt-4o-mini") - lotus.settings.configure(lm=lm) - - # Example dataset with prompts and two candidate responses - data = { - "prompt": [ - "Write a one-sentence summary of the benefits of regular exercise.", - "Explain the difference between supervised and unsupervised learning in one sentence.", - "Suggest a polite email subject line to schedule a 1:1 meeting.", - ], - "model_a": [ - "Regular exercise improves physical health and mental well-being by boosting energy, mood, and resilience.", - "Supervised learning uses labeled data to learn mappings, while unsupervised learning finds patterns without labels.", - "Meeting request.", - ], - "model_b": [ - "Exercise is good.", - "Supervised learning and unsupervised learning are both machine learning approaches.", - "Requesting a 1:1: finding time to connect next week?", - ], - } - - df = pd.DataFrame(data) - - # Define comparison criteria - judge_instruction = ( - "Given the prompt {prompt}, compare the two responses.\\n" - "- Response A: {model_a}\\n" - "- Response B: {model_b}\\n\\n" - "Choose the better response based on helpfulness, correctness, and clarity. " - "Output only 'A' or 'B' or 'Tie' if the responses are equally good." - ) - - # Run pairwise evaluation - results = df.pairwise_judge( - col1="model_a", - col2="model_b", - judge_instruction=judge_instruction, - n_trials=2, - permute_cols=True, # Mitigate position bias by evaluating both (A,B) and (B,A) - ) - - print(results) - -Position Bias Mitigation ------------------------- - -Position bias occurs when judges systematically prefer responses in certain positions (e.g., always preferring the first response). The ``permute_cols`` parameter helps mitigate this: - -.. 
code-block:: python - - # This will evaluate both (model_a, model_b) and (model_b, model_a) orderings - results = df.pairwise_judge( - col1="model_a", - col2="model_b", - judge_instruction=judge_instruction, - n_trials=4, # Must be even when permute_cols=True - permute_cols=True, - ) - - -Advanced Features -================= - -Chain-of-Thought Reasoning ---------------------------- - -Enable chain-of-thought reasoning for more explainable evaluations: - -.. code-block:: python - - from lotus.types import ReasoningStrategy - - results = df.llm_as_judge( - judge_instruction="Evaluate the quality of this {answer}", - strategy=ReasoningStrategy.COT, # Enable chain-of-thought - n_trials=1, - ) - - results = df.pairwise_judge( - col1="model_a", - col2="model_b", - judge_instruction=judge_instruction, - n_trials=4, # Must be even when permute_cols=True - permute_cols=True, - strategy=ReasoningStrategy.COT, - ) - -Few-Shot Learning ------------------ - -Provide examples to guide the evaluation process: - -.. code-block:: python - - # Create examples DataFrame - examples_data = { - "question": ["What is machine learning?"], - "answer": ["Machine learning is a subset of AI that enables computers to learn from data."], - "Answer": ["8"] # Expected score - note the capital 'A' - } - examples_df = pd.DataFrame(examples_data) - - # Use examples in evaluation - results = df.llm_as_judge( - judge_instruction="Rate this {answer} to the {question} from 1-10", - examples=examples_df, - ) - -Custom System Prompts ---------------------- - -Customize the system prompt for specific evaluation contexts: - -.. code-block:: python - - custom_system_prompt = ( - "You are an expert educator with 20 years of experience in computer science. " - "Evaluate student responses with attention to technical accuracy and clarity." 
- ) - - results = df.llm_as_judge( - judge_instruction="Evaluate this {answer}", - system_prompt=custom_system_prompt, - ) - -API Reference -============= - -llm_as_judge ------------- - -.. function:: DataFrame.llm_as_judge(judge_instruction, response_format=None, n_trials=1, system_prompt=None, suffix="_judge", examples=None, strategy=None, safe_mode=False, **model_kwargs) - - Evaluate responses using LLM-as-Judge methodology. - - :param judge_instruction: Natural language instruction for evaluation. Use {column_name} to reference DataFrame columns. - :type judge_instruction: str - :param response_format: Pydantic model for structured output. If None, returns string. - :type response_format: BaseModel | None - :param n_trials: Number of evaluation trials to run. - :type n_trials: int - :param system_prompt: Custom system prompt for the judge. - :type system_prompt: str | None - :param suffix: Suffix for output column names. - :type suffix: str - :param examples: Example DataFrame for few-shot learning. Must include "Answer" column. - :type examples: pd.DataFrame | None - :param strategy: Reasoning strategy (None, COT, ZS_COT). - :type strategy: ReasoningStrategy | None - :param safe_mode: Enable cost estimation before execution. - :type safe_mode: bool - :param model_kwargs: Additional arguments passed to the language model. - :return: DataFrame with original data plus evaluation results. - :rtype: pd.DataFrame - -pairwise_judge --------------- - -.. function:: DataFrame.pairwise_judge(col1, col2, judge_instruction, response_format=None, n_trials=1, permute_cols=False, system_prompt=None, suffix="_judge", examples=None, strategy=None, safe_mode=False, **model_kwargs) - - Compare two responses using pairwise evaluation. - - :param col1: Name of the first column to compare. - :type col1: str - :param col2: Name of the second column to compare. - :type col2: str - :param judge_instruction: Natural language instruction for comparison. 
Use {column_name} to reference DataFrame columns. - :type judge_instruction: str - :param response_format: Pydantic model for structured output. If None, returns string. - :type response_format: BaseModel | None - :param n_trials: Number of evaluation trials to run. - :type n_trials: int - :param permute_cols: Whether to permute column order to mitigate position bias. If True, n_trials must be even. - :type permute_cols: bool - :param system_prompt: Custom system prompt for the judge. - :type system_prompt: str | None - :param suffix: Suffix for output column names. - :type suffix: str - :param examples: Example DataFrame for few-shot learning. Must include "Answer" column. - :type examples: pd.DataFrame | None - :param strategy: Reasoning strategy (None, COT, ZS_COT). - :type strategy: ReasoningStrategy | None - :param safe_mode: Enable cost estimation before execution. - :type safe_mode: bool - :param model_kwargs: Additional arguments passed to the language model. - :return: DataFrame with original data plus comparison results. - :rtype: pd.DataFrame - -Best Practices -============== - -Evaluation Design ------------------ - -1. **Clear Instructions**: Write specific, unambiguous evaluation criteria -2. **Multiple Trials**: Use multiple trials to improve reliability and account for model variability -3. **Position Bias**: Use ``permute_cols=True`` in pairwise comparisons to mitigate ordering effects -4. **Structured Output**: Use Pydantic models for consistent, parseable results -5. **Appropriate Models**: Choose models with strong reasoning capabilities for complex evaluations - -Performance Considerations --------------------------- - -1. **Batch Size**: Larger DataFrames will result in more API calls -2. **Model Selection**: Balance evaluation quality with cost and latency -3. **Safe Mode**: Enable safe mode for cost estimation on large datasets -4. 
**Caching**: LOTUS automatically caches results to avoid redundant evaluations - -Common Patterns ---------------- - -**A/B Testing**: - -.. code-block:: python - - # Compare two model versions - results = df.pairwise_judge( - col1="model_v1_output", - col2="model_v2_output", - judge_instruction="Which response better answers {user_query}?", - permute_cols=True, - n_trials=4 - ) - -**Content Moderation**: - -.. code-block:: python - - class ModerationResult(BaseModel): - is_safe: bool = Field(description="Whether the content is safe") - risk_level: str = Field(description="Risk level: low, medium, high") - reasoning: str = Field(description="Explanation for the decision") - - results = df.llm_as_judge( - judge_instruction="Evaluate if this {content} is safe for a general audience", - response_format=ModerationResult - ) - -**Response Quality Assessment**: - -.. code-block:: python - - class QualityScore(BaseModel): - helpfulness: int = Field(description="Helpfulness score 1-10") - accuracy: int = Field(description="Accuracy score 1-10") - clarity: int = Field(description="Clarity score 1-10") - overall: int = Field(description="Overall score 1-10") - - results = df.llm_as_judge( - judge_instruction="Evaluate the quality of this {response} to {question}", - response_format=QualityScore - ) diff --git a/docs/evaluation.rst b/docs/evaluation.rst new file mode 100644 index 00000000..492ac211 --- /dev/null +++ b/docs/evaluation.rst @@ -0,0 +1,80 @@ +Evaluation Suite +================ + +LOTUS includes LLM-as-judge tools for evaluating model outputs, application +responses, and content quality directly from pandas DataFrames. + +The evaluation suite has two DataFrame accessors: + +- ``llm_as_judge`` evaluates each row independently. +- ``pairwise_judge`` compares two response columns and chooses the better + response for each row. + +Setup +----- + +.. 
code-block:: python + + import pandas as pd + import lotus + from lotus.models import LM + + lm = LM(model="gpt-4o-mini") + lotus.settings.configure(lm=lm) + + df = pd.DataFrame({ + "question": [ + "What is cross-validation?", + "What is gradient descent?", + ], + "answer": [ + "Cross-validation estimates generalization by evaluating on held-out splits.", + "Gradient descent iteratively updates parameters to reduce a loss function.", + ], + }) + +Choose the Right Evaluator +-------------------------- + +Use ``llm_as_judge`` when each row has one response to score, classify, or +annotate. + +.. code-block:: python + + scored = df.llm_as_judge( + "Rate the accuracy of {answer} for {question} from 1 to 10. " + "Return only the score." + ) + +Use ``pairwise_judge`` when each row has two responses and you want a direct +comparison. + +.. code-block:: python + + pairwise_df = pd.DataFrame({ + "question": ["What is cross-validation?"], + "model_a": ["It evaluates a model on several held-out splits."], + "model_b": ["It checks whether a model knows the answer."], + }) + + compared = pairwise_df.pairwise_judge( + col1="model_a", + col2="model_b", + judge_instruction="Which response better answers {question}?", + permute_cols=True, + n_trials=2, + ) + +Caching Behavior +---------------- + +Evaluation calls temporarily disable LOTUS operator caching inside the judge +loop so repeated trials can produce independent judgments. The global cache +setting is restored after the evaluation call finishes. + +Related Pages +------------- + +- :doc:`llm_as_judge` +- :doc:`pairwise_judge` +- :doc:`evaluation_advanced` diff --git a/docs/evaluation_advanced.rst b/docs/evaluation_advanced.rst new file mode 100644 index 00000000..2f649ba4 --- /dev/null +++ b/docs/evaluation_advanced.rst @@ -0,0 +1,113 @@ +Evaluation Advanced Features +============================ + +This page collects evaluation features that apply across the evaluation suite. 
+ +Reasoning Strategies +-------------------- + +Use ``ReasoningStrategy.COT`` or ``ReasoningStrategy.ZS_COT`` when you want +chain-of-thought style reasoning from the judge. + +.. code-block:: python + + from lotus.types import ReasoningStrategy + + results = df.llm_as_judge( + "Evaluate the quality of {answer} for {question}.", + strategy=ReasoningStrategy.COT, + return_explanations=True, + ) + +Reasoning strategies cannot be combined with ``response_format`` in +``llm_as_judge``. For structured outputs with reasoning, add a reasoning field +to the Pydantic response model and do not set a CoT strategy. + +Structured Output +----------------- + +``llm_as_judge`` accepts a Pydantic ``response_format``. + +.. code-block:: python + + from pydantic import BaseModel, Field + + class SafetyResult(BaseModel): + is_safe: bool = Field(description="Whether the content is safe") + risk_level: str = Field(description="low, medium, or high") + reasoning: str = Field(description="Explanation for the decision") + + results = df.llm_as_judge( + "Evaluate whether {content} is safe for a general audience.", + response_format=SafetyResult, + ) + +Few-Shot Examples +----------------- + +Both evaluation accessors accept ``examples`` DataFrames. Include the same +input columns as the evaluated DataFrame plus an ``Answer`` column. + +.. code-block:: python + + examples = pd.DataFrame({ + "question": ["What is gradient descent?"], + "answer": ["An optimization method that follows the loss gradient."], + "Answer": ["9"], + }) + + results = df.llm_as_judge( + "Rate {answer} for {question} from 1 to 10.", + examples=examples, + ) + +If the examples are used with ``ReasoningStrategy.COT``, include a +``Reasoning`` column. + +Custom System Prompts +--------------------- + +Use ``system_prompt`` to set judge role, rubric context, or domain expertise. + +.. 
code-block:: python + + results = df.llm_as_judge( + "Evaluate {answer} for {question}.", + system_prompt=( + "You are an expert computer science instructor. " + "Grade for correctness, completeness, and clarity." + ), + ) + +Pairwise Cascades +----------------- + +``pairwise_judge`` supports filter cascades through ``cascade_args`` and +``helper_examples``. This routes confident comparisons through a helper model +and sends uncertain comparisons to the main LM. + +.. code-block:: python + + from lotus.types import CascadeArgs + + cascade_args = CascadeArgs( + recall_target=0.9, + precision_target=0.9, + sampling_percentage=0.5, + failure_probability=0.2, + ) + + results, stats = df.pairwise_judge( + "model_a", + "model_b", + "Which response better answers {question}?", + cascade_args=cascade_args, + return_stats=True, + ) + +Cache Isolation +--------------- + +Evaluation trials disable LOTUS operator caching while the judge calls run. +This prevents repeated trials from returning cached judgments. LOTUS restores +the original cache setting after evaluation completes. diff --git a/docs/index.rst b/docs/index.rst index 6b594909..99869bb5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,7 @@ :height: 170px :align: center -LOTUS Makes LLM-Powerd Data Processing Fast, Easy and Robust +LOTUS Makes LLM-Powered Data Processing Fast, Easy and Robust ================================================================================= LOTUS implements the semantic operator programming model and provides an optimized query engine for serving AI-based query pipelines over your data. @@ -46,11 +46,31 @@ LOTUS implements the semantic operator programming model and provides an optimiz sem_index sem_dedup web_search + web_extract .. toctree:: :hidden: :maxdepth: 1 - :caption: Models + :caption: LazyFrame + + lazyframe + lazyframe_optimizations + lazyframe_api + +.. 
toctree:: + :hidden: + :maxdepth: 1 + :caption: Evaluation Suite + + evaluation + llm_as_judge + pairwise_judge + evaluation_advanced + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Models llm retriever_models @@ -68,8 +88,6 @@ LOTUS implements the semantic operator programming model and provides an optimiz prompt_strategies configurations reasoning_models - evals - ast .. toctree:: :hidden: @@ -78,5 +96,3 @@ LOTUS implements the semantic operator programming model and provides an optimiz data_connectors DirectoryReader - - diff --git a/docs/installation.rst b/docs/installation.rst index e89c2b79..93d4518f 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -14,7 +14,7 @@ Install with uv (Recommended) For the latest stable release: -.. code-block:: console +.. code-block:: bash # Install uv if you haven't already $ curl -LsSf https://astral.sh/uv/install.sh | sh @@ -24,7 +24,7 @@ For the latest stable release: For the latest features: -.. code-block:: console +.. code-block:: bash $ uv add git+https://github.com/lotus-data/lotus.git@main @@ -33,7 +33,7 @@ Install with pip For the latest stable release: -.. code-block:: console +.. code-block:: bash $ conda create -n lotus python=3.10 -y $ conda activate lotus @@ -41,7 +41,7 @@ For the latest stable release: For the latest features: -.. code-block:: console +.. code-block:: bash $ conda create -n lotus python=3.10 -y $ conda activate lotus @@ -52,13 +52,13 @@ Optional Subpackages LOTUS supports optional subpackages for extended functionality. Install them using the ``lotus-ai[]`` syntax: -.. code-block:: console +.. code-block:: bash $ pip install "lotus-ai[serpapi]" Or with uv: -.. code-block:: console +.. 
code-block:: bash $ uv add "lotus-ai[serpapi]" diff --git a/docs/lazyframe.rst b/docs/lazyframe.rst new file mode 100644 index 00000000..507f92fe --- /dev/null +++ b/docs/lazyframe.rst @@ -0,0 +1,242 @@ +LazyFrame API +============= + +``LazyFrame`` is LOTUS' lazy execution API for semantic operator programs. It +lets you define a pipeline first, then execute it later on a DataFrame. Nothing +runs until you call ``execute()``. + +Why LazyFrame? +-------------- + +Eager LOTUS execution is useful when you are exploring data and want each +operator to run immediately, just like pandas. LazyFrame is useful when you +have a multi-step LLM program and want LOTUS to see the whole plan before any +expensive model calls happen. + +That global plan makes several things possible: + +- inspect the semantic and pandas operations that will run +- move cheap pandas filters before expensive semantic filters +- optimize prompts across the whole pipeline instead of one operator at a time +- pre-learn cascade thresholds so cheaper models can handle easy rows +- save an optimized pipeline and reuse it in a later session + +In other words, LazyFrame gives LOTUS the same kind of planning boundary that +a query engine has: you describe what should happen, then LOTUS decides how to +execute it efficiently. + +What You Can Build +------------------ + +LazyFrame is useful for LLM-based data workflows where the result depends on a +pipeline rather than a single prompt. Examples include: + +- filtering agent traces and aggregating the failures into a taxonomy +- running LLM-as-judge or pairwise-judge evaluations over model outputs +- building RAG-style pipelines that search, transform, and aggregate evidence +- extracting structured tables from long documents or web pages +- combining semantic operators with pandas cleanup, grouping, and slicing + +Quick Start +----------- + +This example builds a semantic filter pipeline over GitHub-style issue titles. 
+The pipeline is defined first and executed later. + +.. code-block:: python + + import pandas as pd + import lotus + from lotus.ast import LazyFrame + from lotus.models import LM + + lm = LM(model="gpt-4.1-nano") + lotus.settings.configure(lm=lm) + + issues = pd.DataFrame({ + "issue_title": [ + "Fix typo in README", + "Add dark mode support to dashboard", + "Refactor entire auth system to use OAuth2", + "Update copyright year in LICENSE", + "Implement distributed transaction support across microservices", + "Change button color on settings page", + "Migrate database from Postgres 13 to 16 with zero downtime", + "Add missing comma in error message", + "Build custom query planner to replace third-party dependency", + "Bump lodash to fix known CVE", + "Support multi-region active-active replication", + "Remove unused import in utils.py", + ] + }) + + pipeline = LazyFrame().sem_filter( + "The {issue_title} describes a small, self-contained task that a new " + "open source contributor could tackle without deep knowledge of the codebase" + ) + + good_first_issues = pipeline.execute(issues) + +Output: + ++----+----------------------------------------------+ +| | issue_title | ++====+==============================================+ +| 0 | Fix typo in README | ++----+----------------------------------------------+ +| 3 | Update copyright year in LICENSE | ++----+----------------------------------------------+ +| 5 | Change button color on settings page | ++----+----------------------------------------------+ +| 7 | Add missing comma in error message | ++----+----------------------------------------------+ +| 9 | Bump lodash to fix known CVE | ++----+----------------------------------------------+ +| 11 | Remove unused import in utils.py | ++----+----------------------------------------------+ + +This has the same user-facing result as eager ``issues.sem_filter(...)``, but +the lazy version can also be inspected, optimized, saved, and reused. 
+ +How Lazy Execution Works +------------------------ + +Each LazyFrame operation appends a node to a logical plan. Semantic operators, +pandas operations, evaluation operators, joins, and custom functions are all +represented in that plan. When you call ``execute()``, LOTUS walks the plan and +materializes the final DataFrame. + +You can inspect the plan before execution: + +.. code-block:: python + + pipeline.print_tree() + +Output: + +.. code-block:: text + + sem_filter('The {issue_title} describes a small, self-containe...') + -- Source(bound=False) + +This is useful when a pipeline has multiple semantic operators or nested +LazyFrames and you want to confirm the execution plan before spending LM calls. + +Source Data +----------- + +You can pass data at execution time, bind it when constructing the LazyFrame, +or provide a schema that is checked at execution time. + +.. code-block:: python + + # Pass data at execution time. + pipeline = LazyFrame().sem_filter("{issue_title} is documentation-only") + result = pipeline.execute(issues) + + # Bind data in the LazyFrame. + pipeline = LazyFrame(df=issues).sem_filter("{issue_title} is documentation-only") + result = pipeline.execute({}) + + # Validate execution input. + pipeline = LazyFrame(schema={"issue_title": "object"}).sem_filter( + "{issue_title} is documentation-only" + ) + result = pipeline.execute(issues) + +Chaining Operators +------------------ + +LazyFrame supports LOTUS semantic operators and common pandas operations in the +same pipeline. + +.. 
code-block:: python + + pipeline = ( + LazyFrame() + .assign(title_length=lambda df: df["issue_title"].str.len()) + .filter(lambda df: df["title_length"] < 80) + .sem_filter("{issue_title} is a good first issue") + .sem_map("Summarize {issue_title} as a contributor task", suffix="_task") + .head(5) + ) + +The semantic operator methods mirror the DataFrame API, including +``sem_filter``, ``sem_map``, ``sem_extract``, ``sem_agg``, ``sem_topk``, +``sem_join``, ``sem_sim_join``, ``sem_search``, ``sem_index``, +``load_sem_index``, ``sem_cluster_by``, ``sem_dedup``, and +``sem_partition_by``. LazyFrame also supports evaluation operators: +``llm_as_judge`` and ``pairwise_judge``. + +Multi-Source Pipelines +---------------------- + +For one source, pass a DataFrame directly to ``execute()``. + +.. code-block:: python + + result = pipeline.execute(issues) + +For multiple sources, create one source LazyFrame per input and pass a +dictionary keyed by those source objects. + +.. code-block:: python + + issues_lf = LazyFrame() + labels_lf = LazyFrame() + + joined = issues_lf.sem_join( + labels_lf, + "The issue {issue_title:left} should receive the label {label:right}", + ) + + result = joined.execute({ + issues_lf: issues, + labels_lf: labels, + }) + +Composition +----------- + +Use ``LazyFrame.concat`` to combine LazyFrame results and ``LazyFrame.from_fn`` +when you need to apply a custom callable after one or more LazyFrames are +resolved. + +.. code-block:: python + + docs = LazyFrame().sem_filter("{issue_title} is about documentation") + frontend = LazyFrame().sem_filter("{issue_title} is about UI work") + + combined = LazyFrame.concat([docs, frontend], ignore_index=True) + result = combined.execute({docs: issues, frontend: issues}) + +.. 
code-block:: python + + def dedupe_by_title(df): + return df.drop_duplicates(subset=["issue_title"]) + + deduped = LazyFrame.from_fn(dedupe_by_title, combined) + result = deduped.execute({docs: issues, frontend: issues}) + +Persistence +----------- + +Save and load pipelines with ``save()`` and ``LazyFrame.load()``. This is most +useful after optimization, because the optimized instructions and learned +cascade thresholds are stored with the pipeline. + +.. code-block:: python + + pipeline.save("good_first_issue_pipeline.pkl") + + loaded = LazyFrame.load("good_first_issue_pipeline.pkl") + result = loaded.execute(issues) + +Pipelines that include local callables, lambdas, or closures may not be +portable across Python environments because they are serialized with pickle. + +Related Pages +------------- + +- :doc:`lazyframe_optimizations` +- :doc:`lazyframe_api` diff --git a/docs/lazyframe_api.rst b/docs/lazyframe_api.rst new file mode 100644 index 00000000..26bd8b58 --- /dev/null +++ b/docs/lazyframe_api.rst @@ -0,0 +1,18 @@ +LazyFrame API Reference +======================= + +LazyFrame +--------- + +.. automodule:: lotus.ast.lazyframe + :members: + :undoc-members: + :show-inheritance: + +Optimizers +---------- + +.. automodule:: lotus.ast.optimizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/lazyframe_optimizations.rst b/docs/lazyframe_optimizations.rst new file mode 100644 index 00000000..ffabeea4 --- /dev/null +++ b/docs/lazyframe_optimizations.rst @@ -0,0 +1,236 @@ +Optimized Execution with LazyFrames +=================================== + +LazyFrame separates pipeline definition from execution. That gives LOTUS a +complete logical plan before any expensive LLM calls happen. ``optimize()`` +uses that plan to rewrite execution order, tune prompts, and prepare cascades. + +Why Optimize? +------------- + +LLM data processing pipelines are sensitive to both cost and prompt quality. 
+A pipeline that is correct on a few examples may become expensive or brittle +at dataset scale. Optimization helps in three ways: + +- **Reduce cost** by doing cheap deterministic work before LM calls. +- **Improve accuracy** by tuning semantic instructions against training data. +- **Reuse learned state** such as cascade thresholds in future runs. + +This matters most for multi-step programs: filtering traces before aggregating +failure modes, judging many model outputs, running RAG over retrieved evidence, +or extracting structured fields from many documents. + +Optimization Flow +----------------- + +LazyFrame optimization has three steps: + +1. Build the pipeline. Nothing executes yet. +2. Call ``optimize()`` with optional optimizers and training data. +3. Call ``execute()`` on the optimized pipeline. + +Original pipeline: + +.. code-block:: python + + from lotus.ast import LazyFrame + + pipeline = LazyFrame().sem_filter( + "The {issue_title} describes a small, self-contained task that a new " + "open source contributor could tackle without deep knowledge of the codebase" + ) + + pipeline.print_tree() + +Updated pipeline: + +.. code-block:: python + + from lotus.ast.optimizer import GEPAOptimizer, CascadeOptimizer + + optimized = pipeline.optimize( + [GEPAOptimizer(eval_fn=eval_fn), CascadeOptimizer()], + train_data=training_issues, + ) + + optimized.print_tree() + result = optimized.execute(issues) + +``pipeline`` is the original logical plan. ``optimized`` is the updated plan +returned by LOTUS after applying the selected optimizers plus default +predicate pushdown. Printing both trees is the easiest way to inspect what +changed before you run the optimized pipeline on the full dataset. + +``optimize()`` returns a new LazyFrame by default. Pass ``inplace=True`` only +when you want to update the existing object. + +Predicate Pushdown +------------------ + +Predicate pushdown moves cheap pandas filters before semantic operators when +that rewrite is safe. 
It is on by default whenever you call ``optimize()``. +You do not need to include it in the optimizer list. + +This helps because pandas filters are local and inexpensive, while semantic +filters call an LM. If a pandas filter removes half the rows, pushing it before +``sem_filter`` can remove half the LM calls. + +.. code-block:: python + + pipeline = ( + LazyFrame() + .sem_filter("{issue_title} is a good first issue") + .filter(lambda df: df["priority"] != "critical") + ) + + pipeline.print_tree() + optimized = pipeline.optimize() # predicate pushdown still runs + optimized.print_tree() + +Output: + +.. code-block:: text + + # Original plan + Source + sem_filter('{issue_title} is a good first issue') + filter(...) + + # Optimized plan + Source + filter(...) + sem_filter('{issue_title} is a good first issue') + +Disable default optimizers when you need exact original plan order. + +.. code-block:: python + + optimized = pipeline.optimize( + [], + auto_include_default_optimizers=False, + ) + +GEPA Prompt Optimization +------------------------ + +``GEPAOptimizer`` uses `GEPA <https://github.com/gepa-ai/gepa>`_ to tune +natural language instructions using training data and an evaluation function. +This is useful when a high-level prompt is easy to write but not accurate +enough for your metric. + +The evaluation function receives ``(output_df, example)`` and returns either a +score or ``(score, side_info)``. Higher scores are better. ``side_info`` gives +the optimizer diagnostic context, such as precision and recall. + +.. 
code-block:: python + + from lotus.ast.optimizer import GEPAOptimizer + + GOOD_FIRST_ISSUE_IDS = {0, 3, 5, 7, 9, 11} + + def eval_fn(output_df, example): + kept = set(output_df.index) + true_positive = len(kept & GOOD_FIRST_ISSUE_IDS) + precision = true_positive / max(len(kept), 1) + recall = true_positive / max(len(GOOD_FIRST_ISSUE_IDS), 1) + f1 = 2 * precision * recall / max(precision + recall, 1e-9) + return f1, {"precision": precision, "recall": recall} + + optimizer = GEPAOptimizer( + eval_fn=eval_fn, + objective="Maximize F1 for identifying good first issues.", + ) + + pipeline = LazyFrame().sem_filter( + "{issue_title} is an easy starter task" + ) + + optimized = pipeline.optimize([optimizer], train_data=issues) + +GEPA can optimize instructions on semantic operators such as ``sem_filter``, +``sem_map``, ``sem_agg``, ``sem_topk``, ``sem_join``, ``sem_search``, and the +evaluation operators. The benefit is end-to-end prompt tuning: if a pipeline +has multiple semantic steps, the prompts can be improved together instead of +tuning each operator in isolation. + +Choosing What GEPA Can Change +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use ``mark_optimizable`` to restrict which parameters GEPA can rewrite. Use an +empty list to pin a node so GEPA leaves it unchanged. + +.. code-block:: python + + pipeline = ( + LazyFrame() + .sem_filter( + "{issue_title} is a good first issue", + mark_optimizable=["user_instruction"], + ) + .sem_map( + "Rewrite {issue_title} as a task title", + suffix="_task", + mark_optimizable=[], + ) + ) + +Cascade Thresholds +------------------ + +Cascades reduce cost by routing easy examples to a cheaper proxy model and +only sending uncertain examples to the main LM. A cascade needs thresholds to +decide what counts as high-confidence. + +``CascadeOptimizer`` runs the pipeline once on training data to learn and store +those thresholds. Later executions reuse the thresholds and skip the learning +pass. + +.. 
code-block:: python + + import lotus + from lotus.ast import LazyFrame + from lotus.ast.optimizer import CascadeOptimizer + from lotus.models import LM + from lotus.types import CascadeArgs + + lotus.settings.configure( + lm=LM(model="gpt-4o"), + helper_lm=LM(model="gpt-4o-mini"), + ) + + cascade_args = CascadeArgs( + recall_target=0.9, + precision_target=0.9, + sampling_percentage=0.5, + failure_probability=0.2, + ) + + pipeline = LazyFrame().sem_filter( + "{issue_title} is a good first issue", + cascade_args=cascade_args, + ) + + optimized = pipeline.optimize([CascadeOptimizer()], train_data=issues) + result = optimized.execute(issues) + +Use higher ``recall_target`` when missing true positives is costly. Use higher +``precision_target`` when false positives are costly. Higher targets usually +increase main-LM calls. + +Saving Optimized Pipelines +-------------------------- + +Optimized pipelines can be saved and loaded like any LazyFrame. This preserves +optimized prompts and learned cascade thresholds. + +.. code-block:: python + + optimized.save("optimized_lf.pkl") + + loaded = LazyFrame.load("optimized_lf.pkl") + result = loaded.execute(issues) + +API Reference +------------- + +See :doc:`lazyframe_api` for the full LazyFrame and optimizer API reference. diff --git a/docs/llm.rst b/docs/llm.rst index 058d657f..1bfccd52 100644 --- a/docs/llm.rst +++ b/docs/llm.rst @@ -106,7 +106,7 @@ Example setting token rate limits: ) Usage Limits ------------ +------------ The LM class supports setting usage limits to control costs and token consumption. You can set limits on: - Prompt tokens diff --git a/docs/llm_as_judge.rst b/docs/llm_as_judge.rst new file mode 100644 index 00000000..05813b8f --- /dev/null +++ b/docs/llm_as_judge.rst @@ -0,0 +1,154 @@ +LLM as Judge +============ + +``llm_as_judge`` evaluates each row with a natural language judge instruction. +Use column references such as ``{answer}`` and ``{question}`` in the +instruction. 
+ +Basic Usage +----------- + +.. code-block:: python + + import pandas as pd + import lotus + from lotus.models import LM + + lotus.settings.configure(lm=LM(model="gpt-4o-mini")) + + df = pd.DataFrame({ + "question": [ + "Explain supervised learning.", + "Explain cross-validation.", + ], + "answer": [ + "Supervised learning trains on labeled examples.", + "Cross-validation evaluates a model on multiple held-out splits.", + ], + }) + + results = df.llm_as_judge( + "Rate the accuracy and completeness of {answer} for {question} " + "from 1 to 10. Return only the score.", + n_trials=2, + ) + + print(results) + +Output Columns +-------------- + +For each trial, LOTUS adds one output column named ``{suffix}_{trial}``. +The default suffix is ``_judge``, so the first trial is ``_judge_0``. + +Set ``return_raw_outputs=True`` to add ``raw_output{suffix}_{trial}``. +Set ``return_explanations=True`` to add ``explanation{suffix}_{trial}``. + +Structured Output +----------------- + +Pass a Pydantic model as ``response_format`` when you want structured judge +outputs. + +.. code-block:: python + + from pydantic import BaseModel, Field + + class Evaluation(BaseModel): + score: int = Field(description="Score from 1 to 10") + reasoning: str = Field(description="Reason for the score") + + results = df.llm_as_judge( + "Evaluate {answer} for {question}.", + response_format=Evaluation, + suffix="_evaluation", + ) + + first = results.loc[0, "_evaluation_0"] + print(first.score) + print(first.reasoning) + +``response_format`` is not supported with ``ReasoningStrategy.COT`` or +``ReasoningStrategy.ZS_COT``. Put reasoning fields in the structured output +model instead. + +Few-Shot Examples +----------------- + +Pass examples with the same input columns and an ``Answer`` column. + +.. 
code-block:: python + + examples = pd.DataFrame({ + "question": ["What is supervised learning?"], + "answer": ["It uses labeled examples to train a model."], + "Answer": ["9"], + }) + + results = df.llm_as_judge( + "Rate {answer} for {question} from 1 to 10.", + examples=examples, + ) + +If you use ``ReasoningStrategy.COT`` with examples, include a ``Reasoning`` +column in the examples DataFrame. + +Extra Context Columns +--------------------- + +``extra_cols_to_include`` lets you include columns in the judge input even +when they are not referenced directly in the instruction. + +.. code-block:: python + + results = df.llm_as_judge( + "Evaluate the answer: {answer}", + extra_cols_to_include=["question"], + ) + +Parameters +---------- + +.. code-block:: python + + DataFrame.llm_as_judge( + judge_instruction, + response_format=None, + n_trials=1, + system_prompt=None, + postprocessor=map_postprocess, + return_raw_outputs=False, + return_explanations=False, + suffix="_judge", + examples=None, + cot_reasoning=None, + strategy=None, + extra_cols_to_include=None, + safe_mode=False, + progress_bar_desc="Evaluating", + **model_kwargs, + ) + +- ``judge_instruction``: Natural language judge instruction. +- ``response_format``: Optional Pydantic model for structured output. +- ``n_trials``: Number of independent judge trials. +- ``system_prompt``: Optional system prompt for the judge. +- ``postprocessor``: Function that parses raw model outputs. +- ``return_raw_outputs``: Include raw model text columns. +- ``return_explanations``: Include explanation columns. +- ``suffix``: Base suffix for output columns. +- ``examples``: Few-shot examples with an ``Answer`` column. +- ``cot_reasoning``: Reasoning strings for direct function use. +- ``strategy``: Optional reasoning strategy. +- ``extra_cols_to_include``: Extra columns to include in judge inputs. +- ``safe_mode``: Estimate cost before execution. +- ``progress_bar_desc``: Progress bar label. 
+- ``model_kwargs``: Extra keyword arguments passed to the LM. + +API Reference +------------- + +.. automodule:: lotus.evals.llm_as_judge + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/pairwise_judge.rst b/docs/pairwise_judge.rst new file mode 100644 index 00000000..14e370ab --- /dev/null +++ b/docs/pairwise_judge.rst @@ -0,0 +1,149 @@ +Pairwise judge +============== + +``pairwise_judge`` compares two columns row by row. It returns ``A`` when +``col1`` is better and ``B`` when ``col2`` is better. + +Basic Usage +----------- + +.. code-block:: python + + import pandas as pd + import lotus + from lotus.models import LM + + lotus.settings.configure(lm=LM(model="gpt-4o-mini")) + + df = pd.DataFrame({ + "question": [ + "Explain cross-validation in one sentence.", + "Suggest a subject line for a 1:1 meeting.", + ], + "model_a": [ + "Cross-validation evaluates a model across multiple held-out splits.", + "Meeting request.", + ], + "model_b": [ + "Cross-validation is when the model checks its answers.", + "Requesting time for a 1:1 next week", + ], + }) + + results = df.pairwise_judge( + col1="model_a", + col2="model_b", + judge_instruction="Which response better answers {question}?", + n_trials=2, + permute_cols=True, + ) + + print(results) + +Position Bias Mitigation +------------------------ + +Set ``permute_cols=True`` to run half the trials as ``col1`` versus ``col2`` +and half as ``col2`` versus ``col1``. ``n_trials`` must be even when +``permute_cols=True``. + +.. code-block:: python + + results = df.pairwise_judge( + "model_a", + "model_b", + "Which response is more helpful for {question}?", + n_trials=4, + permute_cols=True, + ) + +Output Columns +-------------- + +For each trial, LOTUS adds one output column named ``{suffix}_{trial}``. +The default suffix is ``_judge``. + +Set ``return_raw_outputs=True`` to include raw model outputs. Set +``return_explanations=True`` to include explanations. 
+ +Cascade Mode +------------ + +``pairwise_judge`` is implemented through semantic filtering and supports +filter cascade options for lower-cost comparisons. + +.. code-block:: python + + from lotus.types import CascadeArgs + + cascade_args = CascadeArgs( + recall_target=0.9, + precision_target=0.9, + sampling_percentage=0.5, + failure_probability=0.2, + ) + + results, stats = df.pairwise_judge( + col1="model_a", + col2="model_b", + judge_instruction="Which response better answers {question}?", + cascade_args=cascade_args, + return_stats=True, + ) + +When ``return_stats=True``, the result is ``(DataFrame, stats)``. + +Parameters +---------- + +.. code-block:: python + + DataFrame.pairwise_judge( + col1, + col2, + judge_instruction, + n_trials=1, + permute_cols=False, + system_prompt=None, + return_raw_outputs=False, + return_explanations=False, + default_to_col1=True, + suffix="_judge", + examples=None, + helper_examples=None, + strategy=None, + cascade_args=None, + return_stats=False, + safe_mode=False, + progress_bar_desc="Evaluating", + additional_cot_instructions="", + **model_kwargs, + ) + +- ``col1``: First response column. Results map this column to ``A``. +- ``col2``: Second response column. Results map this column to ``B``. +- ``judge_instruction``: Natural language comparison criteria. +- ``n_trials``: Number of comparison trials. +- ``permute_cols``: Run both response orders to reduce position bias. +- ``system_prompt``: Optional system prompt for the judge. +- ``return_raw_outputs``: Include raw model text columns. +- ``return_explanations``: Include explanation columns. +- ``default_to_col1``: Default decision when parsing is uncertain. +- ``suffix``: Base suffix for output columns. +- ``examples``: Few-shot examples for the main judge. +- ``helper_examples``: Few-shot examples for the helper LM in cascade mode. +- ``strategy``: Optional reasoning strategy. +- ``cascade_args``: Optional filter cascade configuration. 
+- ``return_stats``: Return cascade statistics with the DataFrame. +- ``safe_mode``: Estimate cost before execution. +- ``progress_bar_desc``: Progress bar label. +- ``additional_cot_instructions``: Extra CoT instructions for sem-filter mode. +- ``model_kwargs``: Extra keyword arguments passed to the LM. + +API Reference +------------- + +.. automodule:: lotus.evals.pairwise_judge + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/prompt_strategies.rst b/docs/prompt_strategies.rst index 4ccfcc25..99416e6e 100644 --- a/docs/prompt_strategies.rst +++ b/docs/prompt_strategies.rst @@ -106,14 +106,17 @@ Here is another example that uses ZS_COT Supported Reasoning Strategies ------------------------------- -ReasoningStrategy.DEFAULT -* Description: The default strategy. The model receives a plain instruction and is expected to provide a direct answer without any explicit reasoning. +``ReasoningStrategy.DEFAULT`` + The default strategy. The model receives a plain instruction and is + expected to provide a direct answer without explicit reasoning. -ReasoningStrategy.COT (Chain of Thought) -* Description: The model is prompted to reason step-by-step after being shown a few reasoning examples. Useful for tasks that benefit from intermediate steps to improve answer accuracy or interpretability. +``ReasoningStrategy.COT`` + Chain-of-thought prompting with reasoning examples. Useful for tasks that + benefit from intermediate steps. -* ReasoningStrategy.ZS_COT (Zero-Shot Chain of Thought) -Description: The model is instructed to reason step-by-step without examples. The reasoning process is triggered purely by the prompt (e.g., “Let’s think step by step”). +``ReasoningStrategy.ZS_COT`` + Zero-shot chain-of-thought prompting. The model is instructed to reason + step by step without examples. -ReasoningStrategy.FEW_SHOT -* Description: The model is provided with few-shot examples, but without explicit reasoning steps. 
It imitates the behavior from the examples to answer the final prompt. +``ReasoningStrategy.FEW_SHOT`` + Few-shot prompting without explicit reasoning steps. diff --git a/docs/sem_agg.rst b/docs/sem_agg.rst index 68a8b190..29a87291 100644 --- a/docs/sem_agg.rst +++ b/docs/sem_agg.rst @@ -1,118 +1,100 @@ sem_agg -====================== +======== -Overview ---------- -This operator performs an aggregation over the input relation, with -a langex signature that provides a commutative and associative aggregation function +``sem_agg`` aggregates many rows into one answer. It is useful for +summarization, synthesis, and reasoning across text-heavy DataFrames. Motivation ------------ -Semantic aggregations are useful for tasks, such as summarization and reasoning across multiple rows of the dataset. +---------- +Traditional aggregations compute values such as sums, counts, and averages. +Many language-heavy tasks need a different kind of aggregation: read many +rows, identify the shared themes, and produce one synthesized answer. +Use ``sem_agg`` when the output depends on the dataset as a whole rather than +one row at a time. Common uses include summarizing a collection of documents, +writing a cross-record report, identifying themes across tickets, or producing +one structured summary per group. + +Article Summary Example +----------------------- -Examples ---------- .. 
code-block:: python import pandas as pd - import lotus - from lotus.models import LM - lm = LM(model="gpt-4o-mini") - lotus.settings.configure(lm=lm) + lotus.settings.configure(lm=LM(model="gpt-4o-mini")) - data = { + articles = pd.DataFrame({ "ArticleTitle": [ "Advancements in Quantum Computing", "Climate Change and Renewable Energy", "The Rise of Artificial Intelligence", - "A Journey into Deep Space Exploration" + "A Journey into Deep Space Exploration", ], "ArticleContent": [ - """Quantum computing harnesses the properties of quantum mechanics - to perform computations at speeds unimaginable with classical machines. - As research and development progress, emerging quantum algorithms show - great promise in solving previously intractable problems.""", - - """Global temperatures continue to rise, and societies worldwide - are turning to renewable resources like solar and wind power to mitigate - climate change. The shift to green technology is expected to reshape - economies and significantly reduce carbon footprints.""", - - """Artificial Intelligence (AI) has grown rapidly, integrating - into various industries. Machine learning models now enable systems to - learn from massive datasets, improving efficiency and uncovering hidden - patterns. However, ethical concerns about privacy and bias must be addressed.""", - - """Deep space exploration aims to understand the cosmos beyond - our solar system. Recent missions focus on distant exoplanets, black holes, - and interstellar objects. 
Advancements in propulsion and life support systems - may one day enable human travel to far-off celestial bodies.""" - ] - } - - df = pd.DataFrame(data) - - df = df.sem_agg("Provide a concise summary of all {ArticleContent} in a single paragraph, highlighting the key technological progress and its implications for the future.") - print(df._output[0]) + ( + "Quantum computing harnesses the properties of quantum mechanics " + "to perform computations at speeds unimaginable with classical " + "machines. Emerging quantum algorithms show promise in solving " + "previously intractable problems." + ), + ( + "Global temperatures continue to rise, and societies worldwide " + "are turning to renewable resources like solar and wind power. " + "The shift to green technology is expected to reshape economies." + ), + ( + "Artificial Intelligence has grown rapidly across industries. " + "Machine learning models improve efficiency and uncover hidden " + "patterns, while privacy and bias concerns remain important." + ), + ( + "Deep space exploration studies the cosmos beyond our solar " + "system. Recent missions focus on exoplanets, black holes, and " + "interstellar objects." + ), + ], + }) -Output: + summary = articles.sem_agg( + "Provide a concise summary of all {ArticleContent} in a single " + "paragraph, highlighting key technological progress and implications " + "for the future." + ) -.. code-block:: text - - "Recent technological advancements are reshaping various fields and have significant implications for the future. - Quantum computing is emerging as a powerful tool capable of solving complex problems at unprecedented speeds, while the - global shift towards renewable energy sources like solar and wind power aims to combat climate change and transform economies. 
- In the realm of Artificial Intelligence, rapid growth and integration into industries are enhancing efficiency and revealing - hidden data patterns, though ethical concerns regarding privacy and bias persist. Additionally, deep space exploration is - advancing with missions targeting exoplanets and black holes, potentially paving the way for human travel beyond our solar - system through improved propulsion and life support technologies." - -Example with group-by ---------------------- -.. code-block:: python + print(summary["_output"].iloc[0]) - import pandas as pd - import lotus - from lotus.models import LM +Output: - lm = LM(model="gpt-4o-mini") - lotus.settings.configure(lm=lm) +.. code-block:: text - # Example DataFrame - data = { - "Category": ["Tech", "Env", "Tech", "Env"], - "ArticleContent": [ - "Quantum computing shows promise in solving complex problems.", - "Renewable energy helps mitigate climate change.", - "AI improves efficiency but raises ethical concerns.", - "New holes in the ozone layer have been found." - ] - } - - df = pd.DataFrame(data) - - # Perform semantic aggregation with groupby - df = df.sem_agg( - "Summarize the {ArticleContent} for each {Category}.", - group_by=["Category"] - ) + Recent technological advances are reshaping computation, energy, AI, and + space exploration. Quantum computing may unlock new classes of algorithms, + renewable energy can reduce climate impact and reshape economies, AI is + improving data-driven decision making while raising governance concerns, + and deep-space research is expanding what future missions may make possible. - print(df._output) +The result is a one-row DataFrame. The default output column is ``_output``. -Output: -.. code-block:: text +Grouped Aggregation +------------------- - 0 The "Env" category features two key points: re... - 0 In the Tech category, two key developments are... +Use ``group_by`` to produce one aggregation per group. +.. 
code-block:: python + grouped = articles.assign( + Category=["Tech", "Env", "Tech", "Space"] + ).sem_agg( + "Summarize the {ArticleContent} for this category.", + group_by=["Category"], + ) +``grouped`` has one output row per category. Long Context Handling ------------------ @@ -120,25 +102,8 @@ When documents exceed the language model's context length, sem_agg supports auto .. code-block:: python - import pandas as pd - import lotus - from lotus.models import LM from lotus.types import LongContextStrategy - # Configure model with smaller context for demonstration - lm = LM(model="gpt-4o-mini", max_ctx_len=2000, max_tokens=200) - lotus.settings.configure(lm=lm) - - # Create DataFrame with potentially large documents - data = { - "title": ["Research Paper", "Blog Post"], - "content": [ - "Very long research content..." * 500, # Exceeds context - "Regular blog post content" - ] - } - df = pd.DataFrame(data) - # Use TRUNCATE strategy (default) - simply cuts off excess content result_truncate = df.sem_agg( "Summarize the key points from {content}", @@ -161,13 +126,56 @@ When documents exceed the language model's context length, sem_agg supports auto - Use **TRUNCATE** when the most important information is at the beginning of documents - Use **CHUNK** when all parts of the document are potentially important and you need to preserve complete information +Structured Output +----------------- + +Pass ``response_format`` when the final answer should follow a Pydantic model +or JSON schema. By default, ``split_fields_into_cols=True`` turns structured +fields into separate DataFrame columns. + +.. 
code-block:: python + + from pydantic import BaseModel, Field + + class ArticleSummary(BaseModel): + theme: str = Field(description="Main theme across the articles") + future_impact: str = Field(description="Likely future implication") + + structured = articles.sem_agg( + "Summarize the shared theme and future impact of {ArticleContent}.", + response_format=ArticleSummary, + ) + +Set ``split_fields_into_cols=False`` if you want the structured model response +to stay in the output column instead of becoming separate fields. + +Return Value +------------ + +``sem_agg`` returns one row for the full DataFrame or one row per group. With +plain text output, the result column is named by ``suffix`` (``"_output"`` by default). With structured output and +``split_fields_into_cols=True``, fields become individual columns. + Required Parameters --------------------- -- **user_instructions** : Prompt to pass into LM +------------------- + +- ``user_instruction``: Natural language aggregation instruction. Reference + columns with ``{column_name}``. Optional Parameters --------------------- -- **all_cols** : Whether to use all columns in the dataframe. -- **suffix** : The suffix for the new column -- **group_by** : The columns to group by before aggregation. Each group will be aggregated separately. -- **long_context_strategy** : Strategy for handling documents that exceed context length (LongContextStrategy.TRUNCATE or LongContextStrategy.CHUNK) +------------------- + +- ``all_cols``: Use all DataFrame columns instead of only columns referenced in + ``user_instruction``. +- ``suffix``: Output column name for plain text output. Defaults to + ``"_output"``. +- ``group_by``: Columns to group by before aggregation. Produces one output row + per group. +- ``safe_mode``: Accepted for API consistency; aggregation safe mode is not + fully implemented. +- ``progress_bar_desc``: Progress bar label. +- ``long_context_strategy``: Strategy for long inputs. Defaults to + ``LongContextStrategy.CHUNK``. 
+- ``split_fields_into_cols``: Split structured output fields into columns when + ``response_format`` is provided. +- ``response_format``: Pydantic model or JSON schema for structured output. diff --git a/docs/sem_filter.rst b/docs/sem_filter.rst index 0bb2a713..0f66e72d 100644 --- a/docs/sem_filter.rst +++ b/docs/sem_filter.rst @@ -1,9 +1,8 @@ sem_filter -================= +========== -Overview ---------- -sem_filter take a langex predicate, and returns data records that pass the predicate. +``sem_filter`` keeps rows whose contents satisfy a natural language predicate. +Reference DataFrame columns with ``{column_name}``. Motivation ----------- @@ -17,198 +16,161 @@ Filter Example .. code-block:: python import pandas as pd - import lotus from lotus.models import LM - lm = LM(model="gpt-4o-mini") + lotus.settings.configure(lm=LM(model="gpt-4o-mini")) - lotus.settings.configure(lm=lm) - data = { + courses = pd.DataFrame({ "Course Name": [ "Probability and Random Processes", "Optimization Methods in Engineering", "Digital Design and Integrated Circuits", "Computer Security", ] - } - df = pd.DataFrame(data) - user_instruction = "{Course Name} requires a lot of math" - df = df.sem_filter(user_instruction) - print(df) + }) + + math_heavy = courses.sem_filter( + "{Course Name} requires a lot of math" + ) + + print(math_heavy) Output: -+---+---------------------------------------------+ -| | Course Name | -+---+---------------------------------------------+ -| 0 | Probability and Random Processes | -+---+---------------------------------------------+ -| 1 | Optimization Methods in Engineering | -+---+---------------------------------------------+ -| 2 | Digital Design and Integrated Circuits | -+---+---------------------------------------------+ ++---+----------------------------------------+ +| | Course Name | ++===+========================================+ +| 0 | Probability and Random Processes | ++---+----------------------------------------+ +| 1 | Optimization 
Methods in Engineering | ++---+----------------------------------------+ +| 2 | Digital Design and Integrated Circuits | ++---+----------------------------------------+ + +The result contains only the rows that the model judged as satisfying the +predicate. +Returning Decisions for Every Row +--------------------------------- +By default, ``sem_filter`` drops rows that do not pass. Set +``return_all=True`` when you want to keep every row and add the model's boolean +decision as a new column. -Example of Filter with Approximation ------------------------ .. code-block:: python - import pandas as pd + judged = courses.sem_filter( + "{Course Name} requires a lot of math", + return_all=True, + suffix="_math_heavy", + ) - import lotus - from lotus.models import LM - from lotus.types import CascadeArgs +``judged`` keeps the original rows and adds ``_math_heavy``. +Explanations and Raw Outputs +---------------------------- - gpt_4o_mini = LM("gpt-4o-mini") - gpt_4o = LM("gpt-4o") +Use ``return_explanations=True`` while developing a predicate or auditing the +model's decisions. 
- lotus.settings.configure(lm=gpt_4o, helper_lm=gpt_4o_mini) - data = { - "Course Name": [ - "Probability and Random Processes", "Optimization Methods in Engineering", "Digital Design and Integrated Circuits", - "Computer Security", "Data Structures and Algorithms", "Machine Learning", "Artificial Intelligence", "Natural Language Processing", - "Introduction to Robotics", "Control Systems", "Linear Algebra and Differential Equations", "Database Systems", "Cloud Computing", - "Software Engineering", "Operating Systems", "Discrete Mathematics", "Numerical Methods", "Wireless Communication Systems", - "Embedded Systems", "Advanced Computer Architecture", "Graph Theory", "Cryptography and Network Security", - "Big Data Analytics", "Deep Learning", "Organic Chemistry", "Molecular Biology", "Environmental Science", - "Genetics and Evolution", "Human Physiology", "Introduction to Anthropology", "Cultural Studies", "Political Theory", - "Macroeconomics", "Microeconomics", "Introduction to Sociology", "Developmental Psychology", "Cognitive Science", - "Introduction to Philosophy", "Ethics and Moral Philosophy", "History of Western Civilization", "Art History: Renaissance to Modern", - "World Literature", "Introduction to Journalism", "Public Speaking and Communication", "Creative Writing", "Music Theory", - "Introduction to Theater", "Film Studies", "Environmental Policy and Law", "Sustainability and Renewable Energy", - "Urban Planning and Design", "International Relations", "Marketing Principles", "Organizational Behavior", - "Financial Accounting", "Corporate Finance", "Business Law", "Supply Chain Management", "Operations Research", - "Entrepreneurship and Innovation", "Introduction to Psychology", "Health Economics", "Biostatistics", - "Social Work Practice", "Public Health Policy", "Environmental Ethics", "History of Political Thought", "Quantitative Research Methods", - "Comparative Politics", "Urban Economics", "Behavioral Economics", "Sociology of Education", 
"Social Psychology", - "Gender Studies", "Media and Communication Studies", "Advertising and Brand Strategy", - "Sports Management", "Introduction to Archaeology", "Ecology and Conservation Biology", "Marine Biology", - "Geology and Earth Science", "Astronomy and Astrophysics", "Introduction to Meteorology", - "Introduction to Oceanography", "Quantum Physics", "Thermodynamics", "Fluid Mechanics", "Solid State Physics", - "Classical Mechanics", "Introduction to Civil Engineering", "Material Science and Engineering", "Structural Engineering", - "Environmental Engineering", "Energy Systems Engineering", "Aerodynamics", "Heat Transfer", - "Renewable Energy Systems", "Transportation Engineering", "Water Resources Management", "Principles of Accounting", - "Project Management", "International Business", "Business Analytics", - ] - } - df = pd.DataFrame(data) - user_instruction = "{Course Name} requires a lot of math" +.. code-block:: python - cascade_args = CascadeArgs(recall_target=0.9, precision_target=0.9, sampling_percentage=0.5, failure_probability=0.2) + judged = courses.sem_filter( + "{Course Name} requires a lot of math", + return_all=True, + return_explanations=True, + return_raw_outputs=True, + ) - df, stats = df.sem_filter(user_instruction=user_instruction, cascade_args=cascade_args, return_stats=True) - print(df) - print(stats) +When ``return_all=False``, explanations and raw outputs are returned only for +the rows that pass. When ``return_all=True``, they are returned for all rows. -Output: +Reasoning and Custom Instructions +--------------------------------- + +Reasoning strategies can improve difficult filters by asking the model to work +through the decision before producing ``True`` or ``False``. + +.. 
code-block:: python -+-----+---------------------------------------------+ -| | Course Name | -+-----+---------------------------------------------+ -| 0 | Probability and Random Processes | -+-----+---------------------------------------------+ -| 1 | Optimization Methods in Engineering | -+-----+---------------------------------------------+ -| 2 | Digital Design and Integrated Circuits | -+-----+---------------------------------------------+ -| 5 | Machine Learning | -+-----+---------------------------------------------+ -| 6 | Artificial Intelligence | -+-----+---------------------------------------------+ -| 7 | Natural Language Processing | -+-----+---------------------------------------------+ -| 8 | Introduction to Robotics | -+-----+---------------------------------------------+ -| 9 | Control Systems | -+-----+---------------------------------------------+ -| 10 | Linear Algebra and Differential Equations | -+-----+---------------------------------------------+ -| 15 | Discrete Mathematics | -+-----+---------------------------------------------+ -| 16 | Numerical Methods | -+-----+---------------------------------------------+ -| 17 | Wireless Communication Systems | -+-----+---------------------------------------------+ -| 19 | Advanced Computer Architecture | -+-----+---------------------------------------------+ -| 20 | Graph Theory | -+-----+---------------------------------------------+ -| 21 | Cryptography and Network Security | -+-----+---------------------------------------------+ -| 22 | Big Data Analytics | -+-----+---------------------------------------------+ -| 23 | Deep Learning | -+-----+---------------------------------------------+ -| 33 | Microeconomics | -+-----+---------------------------------------------+ -| 55 | Corporate Finance | -+-----+---------------------------------------------+ -| 58 | Operations Research | -+-----+---------------------------------------------+ -| 61 | Health Economics | 
-+-----+---------------------------------------------+ -| 62 | Biostatistics | -+-----+---------------------------------------------+ -| 67 | Quantitative Research Methods | -+-----+---------------------------------------------+ -| 69 | Urban Economics | -+-----+---------------------------------------------+ -| 81 | Astronomy and Astrophysics | -+-----+---------------------------------------------+ -| 84 | Quantum Physics | -+-----+---------------------------------------------+ -| 85 | Thermodynamics | -+-----+---------------------------------------------+ -| 86 | Fluid Mechanics | -+-----+---------------------------------------------+ -| 87 | Solid State Physics | -+-----+---------------------------------------------+ -| 88 | Classical Mechanics | -+-----+---------------------------------------------+ -| 89 | Introduction to Civil Engineering | -+-----+---------------------------------------------+ -| 90 | Material Science and Engineering | -+-----+---------------------------------------------+ -| 91 | Structural Engineering | -+-----+---------------------------------------------+ -| 92 | Environmental Engineering | -+-----+---------------------------------------------+ -| 93 | Energy Systems Engineering | -+-----+---------------------------------------------+ -| 94 | Aerodynamics | -+-----+---------------------------------------------+ -| 95 | Heat Transfer | -+-----+---------------------------------------------+ -| 96 | Renewable Energy Systems | -+-----+---------------------------------------------+ -| 97 | Transportation Engineering | -+-----+---------------------------------------------+ -| 102 | Business Analytics | -+-----+---------------------------------------------+ - -Output Statistics: - -{'pos_cascade_threshold': 0.62, 'neg_cascade_threshold': 0.58, 'filters_resolved_by_helper_model': 101, 'filters_resolved_by_large_model': 2, 'num_routed_to_helper_model': 101} + from lotus.types import ReasoningStrategy + filtered = issues.sem_filter( + 
"{issue_title} is a small, self-contained task for a new contributor", + strategy=ReasoningStrategy.ZS_COT, + additional_cot_instructions="Focus on codebase knowledge and blast radius.", + ) + +``system_prompt`` changes the model's role for the filter. ``output_tokens`` +changes the positive and negative labels, which default to ``("True", +"False")``. + +Cascades +-------- + +Cascades reduce cost by using a cheaper helper first and routing uncertain +rows to the main LM. See :doc:`approximation_cascades` for the full details. + +.. code-block:: python + + from lotus.types import CascadeArgs, ProxyModel + + lotus.settings.configure( + lm=LM(model="gpt-4o"), + helper_lm=LM(model="gpt-4o-mini"), + ) + + cascade_args = CascadeArgs( + recall_target=0.9, + precision_target=0.9, + sampling_percentage=0.5, + failure_probability=0.2, + proxy_model=ProxyModel.HELPER_LM, + helper_filter_instruction="{issue_title} is easy for a new contributor", + ) + + filtered, stats = issues.sem_filter( + "{issue_title} is a good first issue", + cascade_args=cascade_args, + return_stats=True, + ) + +``helper_filter_instruction`` can be simpler than the main instruction. If it +is omitted, the helper LM uses the main instruction. + +Return Value +------------ + +Without ``return_stats``, ``sem_filter`` returns a DataFrame. With +``return_stats=True`` and a cascade, it returns ``(df, stats)``. The stats +describe learned thresholds and how many rows were resolved by the helper +versus the main LM. Required Parameters ---------------------- -- **user_instruction** : The user instruction for filtering. +------------------- + +- ``user_instruction``: Natural language predicate. Rows where the predicate is + judged true are kept. Reference columns with ``{column_name}``. Optional Parameters ----------------------- -- **return_raw_outputs** : Whether to return raw outputs. Defaults to False. -- **default** : The default value for filtering in case of parsing errors. Defaults to True. 
-- **suffix** : The suffix for the new columns. Defaults to "_filter". -- **examples** : The examples dataframe. Defaults to None. -- **helper_examples** : The helper examples dataframe. Defaults to None. -- **strategy** : The reasoning strategy. Defaults to None. -- **cascade_args** : The arguments for join cascade. Defaults to None. - recall_target : The target recall. Defaults to None. - precision_target : The target precision when cascading. Defaults to None. - sampling_percentage : The percentage of the data to sample when cascading. Defaults to 0.1. - failure_probability : The failure probability when cascading. Defaults to 0.2. -- **return_stats** : Whether to return statistics. Defaults to False. \ No newline at end of file +------------------- + +- ``return_raw_outputs``: Add raw model text columns. +- ``return_explanations``: Add explanation columns when available. +- ``return_all``: Keep all rows and add the boolean decision column instead of + dropping false rows. +- ``default``: Boolean decision to use when output parsing is uncertain. +- ``suffix``: Output column suffix when ``return_all=True``. +- ``examples``: Few-shot examples for the main LM with an ``Answer`` column. +- ``helper_examples``: Few-shot examples for the helper LM in cascade mode. +- ``strategy``: Optional reasoning strategy. +- ``cascade_args``: Optional cascade configuration. +- ``return_stats``: Return ``(DataFrame, stats)`` when stats are available. +- ``safe_mode``: Estimate cost before execution. +- ``progress_bar_desc``: Progress bar label. +- ``additional_cot_instructions``: Extra instructions for CoT prompting. +- ``system_prompt``: Custom system prompt for the LM. +- ``output_tokens``: Positive and negative output tokens. Defaults to + ``("True", "False")``. +- ``**model_kwargs``: Extra keyword arguments passed to the configured LM. 
diff --git a/docs/sem_join.rst b/docs/sem_join.rst index ac1eed12..fdfa0c85 100644 --- a/docs/sem_join.rst +++ b/docs/sem_join.rst @@ -1,9 +1,8 @@ sem_join -================= +======== -Overview ----------- -The sem_join operator in joins to datasets according to the langex, which specifies a predicate in natural language. +``sem_join`` joins two DataFrames, or a DataFrame and a named Series, using a +natural language predicate instead of an equality condition. Motivation ----------- @@ -17,14 +16,12 @@ Join Example .. code-block:: python import pandas as pd - import lotus from lotus.models import LM - lm = LM(model="gpt-4o-mini") + lotus.settings.configure(lm=LM(model="gpt-4o-mini")) - lotus.settings.configure(lm=lm) - data = { + courses = pd.DataFrame({ "Course Name": [ "History of the Atlantic World", "Riemannian Geometry", @@ -33,127 +30,147 @@ Join Example "Compilers", "Intro to computer science", ] - } + }) + + skills = pd.DataFrame({ + "Skill": ["Math", "Computer Science"], + }) - data2 = {"Skill": ["Math", "Computer Science"]} + joined = courses.sem_join( + skills, + "Taking {Course Name:left} will help me learn {Skill:right}", + ) - df1 = pd.DataFrame(data) - df2 = pd.DataFrame(data2) - join_instruction = "Taking {Course Name:left} will help me learn {Skill:right}" - res = df1.sem_join(df2, join_instruction) - print(res) + print(joined) Output: -+---+----------------------------+-------------------+ -| | Course Name | Skill | -+---+----------------------------+-------------------+ -| 1 | Riemannian Geometry | Math | -+---+----------------------------+-------------------+ -| 2 | Operating Systems | Computer Science | -+---+----------------------------+-------------------+ -| 4 | Compilers | Computer Science | -+---+----------------------------+-------------------+ -| 5 | Intro to computer science | Computer Science | -+---+----------------------------+-------------------+ ++---+---------------------------+------------------+ +| | Course Name | Skill | 
++===+===========================+==================+ +| 0 | Riemannian Geometry | Math | ++---+---------------------------+------------------+ +| 1 | Operating Systems | Computer Science | ++---+---------------------------+------------------+ +| 2 | Compilers | Computer Science | ++---+---------------------------+------------------+ +| 3 | Intro to computer science | Computer Science | ++---+---------------------------+------------------+ +The result contains the matched course-skill pairs. +Column Disambiguation +--------------------- + +Use ``:left`` and ``:right`` when a join instruction references columns from +both sides. -Example of Join with Approximation ----------------------- .. code-block:: python - import pandas as pd + joined = left.sem_join( + right, + "{title:left} and {title:right} describe the same task", + ) + +If there is no ambiguity, LOTUS can infer the left and right columns from the +DataFrame schemas. If a referenced column exists in both DataFrames, use +explicit ``:left`` and ``:right`` suffixes. + +Join Semantics +-------------- + +``sem_join`` currently supports inner joins. ``other`` can be a DataFrame or a +named Series. For each candidate pair, LOTUS evaluates the natural language +predicate and keeps the pairs judged true. + +Set ``return_explanations=True`` to add an ``explanation{suffix}`` column for +the pairs that matched. + +.. code-block:: python + + joined = courses.sem_join( + skills, + "Taking {Course Name:left} will help me learn {Skill:right}", + return_explanations=True, + suffix="_match", + ) + +Cascades +-------- + +Cascades reduce cost by using cheaper helper plans before routing uncertain +pairs to the main LM. See :doc:`approximation_cascades` for the full details. + +.. 
code-block:: python - import lotus - from lotus.models import LM, SentenceTransformersRM from lotus.types import CascadeArgs - from lotus.vector_store import FaissVS - lm = LM(model="gpt-4o-mini") - rm = SentenceTransformersRM(model="intfloat/e5-base-v2") - vs = FaissVS() + cascade_args = CascadeArgs( + recall_target=0.7, + precision_target=0.7, + sampling_percentage=0.2, + failure_probability=0.2, + ) - lotus.settings.configure(lm=lm, rm=rm, vs=vs) - data = { - "Course Name": [ - "Digital Design and Integrated Circuits", - "Data Structures and Algorithms", - "The History of Art", - "Natural Language Processing", - ] - } - - skills = [ - "Math", "Computer Science", "Management", "Creative Writing", "Data Analysis", "Machine Learning", "Project Management", - "Problem Solving", "Singing", "Critical Thinking", "Public Speaking", "Teamwork", "Adaptability", "Programming", - "Leadership", "Time Management", "Negotiation", "Decision Making", "Networking", "Painting", - "Customer Service", "Marketing", "Graphic Design", "Nursery", "SEO", "Content Creation", "Video Editing", "Sales", - "Financial Analysis", "Accounting", "Event Planning", "Foreign Languages", "Software Development", "Cybersecurity", - "Social Media Management", "Photography", "Writing & Editing", "Technical Support", "Database Management", "Web Development", - "Business Strategy", "Operations Management", "UI/UX Design", "Reinforcement Learning", "Data Visualization", - "Product Management", "Cloud Computing", "Agile Methodology", "Blockchain", "IT Support", "Legal Research", "Supply Chain Management", - "Copywriting", "Human Resources", "Quality Assurance", "Medical Research", "Healthcare Management", "Sports Coaching", - "Editing & Proofreading", "Legal Writing", "Human Anatomy", "Chemistry", "Physics", "Biology", - "Psychology", "Sociology", "Anthropology", "Political Science", "Public Relations", "Fashion Design", "Interior Design", - "Automotive Repair", "Plumbing", "Carpentry", "Electrical Work", 
"Welding", "Electronics", "Hardware Engineering", - "Circuit Design", "Robotics", "Environmental Science", "Marine Biology", "Urban Planning", "Geography", - "Agricultural Science", "Animal Care", "Veterinary Science", "Zoology", "Ecology", "Botany", "Landscape Design", - "Baking & Pastry", "Culinary Arts", "Bartending", "Nutrition", "Dietary Planning", "Physical Training", "Yoga", - ] - data2 = pd.DataFrame({"Skill": skills}) - - - df1 = pd.DataFrame(data) - df2 = pd.DataFrame(data2) - join_instruction = "By taking {Course Name:left} I will learn {Skill:right}" - - cascade_args = CascadeArgs(recall_target=0.7, precision_target=0.7) - res, stats = df1.sem_join(df2, join_instruction, cascade_args=cascade_args, return_stats=True) - - - print(f"Joined {df1.shape[0]} rows from df1 with {df2.shape[0]} rows from df2") - print(f" Join cascade took {stats['join_resolved_by_large_model']} LM calls") - print(f" Helper resolved {stats['join_resolved_by_helper_model']} LM calls") - print(f"Join cascade used {stats['total_LM_calls']} LM calls in total") - print(f"Naive join would require {df1.shape[0]*df2.shape[0]} LM calls") - print(res) + joined, stats = courses.sem_join( + skills, + "Taking {Course Name:left} will help me learn {Skill:right}", + cascade_args=cascade_args, + return_stats=True, + ) -Output: +For join cascades, ``CascadeArgs`` can also include ``map_instruction`` and +``map_examples``. 
-+---+----------------------------------------+----------------------+ -| | Course Name | Skill | -+---+----------------------------------------+----------------------+ -| 0 | Digital Design and Integrated Circuits | Circuit Design | -+---+----------------------------------------+----------------------+ -| 3 | Natural Language Processing | Machine Learning | -+---+----------------------------------------+----------------------+ -| 1 | Data Structures and Algorithms | Computer Science | -+---+----------------------------------------+----------------------+ -| 0 | Digital Design and Integrated Circuits | Electronics | -+---+----------------------------------------+----------------------+ -| 0 | Digital Design and Integrated Circuits | Hardware Engineering | -+---+----------------------------------------+----------------------+ +Few-Shot Examples +----------------- +Use ``examples`` when the join relationship is domain-specific. The examples +DataFrame should contain the referenced left and right columns plus an +``Answer`` column with boolean labels. + +.. code-block:: python + + examples = pd.DataFrame({ + "Course Name": ["Operating Systems"], + "Skill": ["Computer Science"], + "Answer": [True], + }) + + joined = courses.sem_join( + skills, + "Taking {Course Name:left} will help me learn {Skill:right}", + examples=examples, + ) + +Return Value +------------ + +``sem_join`` returns an inner-join DataFrame containing the matched left and +right rows. Columns that exist on both sides are renamed with ``:left`` and +``:right``. With ``return_stats=True`` and a cascade, it returns +``(joined_df, stats)``. Required Parameters ----------------------- -- **other** : The other dataframe or series to join with. -- **join_instruction** : The user instruction for join. +------------------- + +- ``other``: Right-hand DataFrame or named Series. +- ``join_instruction``: Natural language predicate over left and right rows. 
+ Use ``:left`` and ``:right`` to disambiguate columns when needed. Optional Parameters ----------------------- -- **return_explanations** : Whether to return explanations. Defaults to False. -- **how** : The type of join to perform. Defaults to "inner". -- **suffix** : The suffix for the new columns. Defaults to "_join". -- **examples** : The examples dataframe. Defaults to None. -- **strategy** : The reasoning strategy. Defaults to None. -- **default** : The default value for the join in case of parsing errors. Defaults to True. -- **cascade_args**: The arguments for join cascade. Defaults to None. - recall_target : The target recall. Defaults to None. - precision_target : The target precision when cascading. Defaults to None. - sampling_percentage : The percentage of the data to sample when cascading. Defaults to 0.1. - failure_probability : The failure probability when cascading. Defaults to 0.2. - map_instruction : The map instruction when cascading. Defaults to None. - map_examples : The map examples when cascading. Defaults to None. -- **return_stats** : Whether to return stats. Defaults to False. \ No newline at end of file +------------------- + +- ``return_explanations``: Add an ``explanation{suffix}`` column for matched + pairs. +- ``how``: Join type. Only ``"inner"`` is currently supported. +- ``suffix``: Suffix for explanation columns. +- ``examples``: Few-shot examples with referenced columns and an ``Answer`` + column. +- ``strategy``: Optional reasoning strategy. +- ``default``: Boolean decision to use when output parsing is uncertain. +- ``cascade_args``: Optional join cascade configuration. +- ``return_stats``: Return ``(DataFrame, stats)`` when cascade stats are + available. +- ``safe_mode``: Estimate cost before execution. +- ``progress_bar_desc``: Progress bar label. 
diff --git a/docs/sem_map.rst b/docs/sem_map.rst index 6b8cf448..3e6c1ec2 100644 --- a/docs/sem_map.rst +++ b/docs/sem_map.rst @@ -1,5 +1,5 @@ sem_map -================= +======== Overview ---------- @@ -10,67 +10,127 @@ Motivation The sem_map operator is useful for performing row-wise transformations over data using natural language instructions. It enables users to apply complex mappings, transformations, or analyses without writing custom code, making it ideal for tasks like content summarization, sentiment analysis, format conversion, and data enrichment. Basic Example ----------- +------------- + .. code-block:: python import pandas as pd import lotus from lotus.models import LM - lm = LM(model="gpt-4o-mini") - - lotus.settings.configure(lm=lm) - data = { - "Course Name": [ - "Probability and Random Processes", - "Optimization Methods in Engineering", - "Digital Design and Integrated Circuits", - "Computer Security", - ] - } - df = pd.DataFrame(data) - user_instruction = "What is a similar course to {Course Name}. Be concise." - df = df.sem_map(user_instruction) - print(df) + lotus.settings.configure(lm=LM(model="gpt-4o-mini")) + + courses = pd.DataFrame({ + "Course Name": [ + "Probability and Random Processes", + "Optimization Methods in Engineering", + "Digital Design and Integrated Circuits", + "Computer Security", + ] + }) + + mapped = courses.sem_map( + "What is a similar course to {Course Name}? Be concise.", + suffix="_similar_course", + ) + + print(mapped) Output: -+---+----------------------------------------+----------------------------------------------------------------+ -| | Course Name | _map | -+===+========================================+================================================================+ -| 0 | Probability and Random Processes | A similar course to "Probability and Random Processes"... 
| -+---+----------------------------------------+----------------------------------------------------------------+ -| 1 | Optimization Methods in Engineering | A similar course to "Optimization Methods in Engineering"... | -+---+----------------------------------------+----------------------------------------------------------------+ -| 2 | Digital Design and Integrated Circuits | A similar course to "Digital Design and Integrated Circuits"...| -+---+----------------------------------------+----------------------------------------------------------------+ -| 3 | Computer Security | A similar course to "Computer Security" is "Cybersecurity"... | -+---+----------------------------------------+----------------------------------------------------------------+ ++---+----------------------------------------+-----------------------+ +| | Course Name | _similar_course | ++===+========================================+=======================+ +| 0 | Probability and Random Processes | Stochastic Processes | ++---+----------------------------------------+-----------------------+ +| 1 | Optimization Methods in Engineering | Convex Optimization | ++---+----------------------------------------+-----------------------+ +| 2 | Digital Design and Integrated Circuits | Computer Architecture | ++---+----------------------------------------+-----------------------+ +| 3 | Computer Security | Cybersecurity | ++---+----------------------------------------+-----------------------+ + +Few-Shot Examples +----------------- + +Use ``examples`` when you want to show the model the desired style or output +format. The examples DataFrame should include the referenced input columns and +an ``Answer`` column. + +.. 
code-block:: python + + examples = pd.DataFrame({ + "issue_title": ["Fix typo in README"], + "Answer": ["Correct a typo in the README file."], + }) + + mapped = issues.sem_map( + "Rewrite {issue_title} as a concise contributor task.", + examples=examples, + suffix="_task", + ) + +Reasoning and Explanations +-------------------------- + +Reasoning strategies ask the model to reason before producing the final +answer. Use them when the mapping requires judgment, such as classifying an +issue into a category or deciding whether text implies a risk. + +.. code-block:: python + + from lotus.types import ReasoningStrategy + + mapped = issues.sem_map( + "Classify {issue_title} as docs, frontend, security, or infrastructure.", + strategy=ReasoningStrategy.ZS_COT, + return_explanations=True, + suffix="_category", + ) + +``return_explanations=True`` adds ``explanation_category``. This is useful +while developing prompts, but it costs extra output tokens and is usually not +needed in production pipelines. + +Raw Outputs and Postprocessing +------------------------------ + +LOTUS normally stores the parsed model output in the ``suffix`` column. Set +``return_raw_outputs=True`` when you also want the unparsed text returned by +the model. + +.. code-block:: python + + mapped = issues.sem_map( + "Return a priority for {issue_title}: low, medium, or high.", + return_raw_outputs=True, + suffix="_priority", + ) + +Use a custom ``postprocessor`` when the model output needs custom parsing. +The postprocessor receives the raw model outputs and returns parsed outputs, +raw outputs, and optional explanations. Required Parameters ---------------------- -- **user_instruction** (str): The natural language instruction that guides the mapping process. Should describe how to transform each row. Column names can be referenced using curly braces, e.g., "{column_name}". +------------------- + +- ``user_instruction``: Natural language instruction for the row-wise + transformation. 
Reference columns with ``{column_name}``. Optional Parameters ---------------------- -- **system_prompt** (str | None): Custom system prompt to use. Defaults to None. -- **postprocessor** (Callable): Function to post-process model outputs. Should take (outputs, model, use_cot) and return SemanticMapPostprocessOutput. Defaults to map_postprocess. -- **return_explanations** (bool): Whether to include explanations in the output DataFrame. Useful for debugging and understanding model reasoning. Defaults to False. -- **return_raw_outputs** (bool): Whether to include raw model outputs in the output DataFrame. Useful for debugging. Defaults to False. -- **suffix** (str): The suffix for the output column names. Defaults to "_map". -- **examples** (pd.DataFrame | None): Example DataFrame for few-shot learning. Should have the same column structure as the input DataFrame plus an "Answer" column. Defaults to None. -- **strategy** (ReasoningStrategy | None): The reasoning strategy to use. Can be None, COT (Chain-of-Thought), or ZS_COT (Zero-Shot Chain-of-Thought). Defaults to None. -- **safe_mode** (bool): Whether to enable safe mode with cost estimation before execution. Defaults to False. -- **progress_bar_desc** (str): Description for the progress bar. Defaults to "Mapping". -- **model_kwargs**: Additional keyword arguments to pass to the language model. - - -Return Types and Output Structure ----------------------------------- - -The sem_map operator returns a DataFrame with the following columns: - -- **Original columns**: All original DataFrame columns are preserved -- **{suffix}**: The main output column (default suffix is "_map") -- **explanation{suffix}**: Explanations column (when return_explanations=True) -- **raw_output{suffix}**: Raw model outputs (when return_raw_outputs=True) +------------------- + +- ``system_prompt``: Custom system prompt for the LM. +- ``postprocessor``: Function that parses raw model outputs. 
+- ``return_explanations``: Add an ``explanation{suffix}`` column when + reasoning is available. +- ``return_raw_outputs``: Add a ``raw_output{suffix}`` column with the raw + model text. +- ``suffix``: Name of the main output column. Defaults to ``"_map"``. +- ``examples``: Few-shot examples with the referenced columns and an + ``Answer`` column. +- ``strategy``: Optional reasoning strategy, such as + ``ReasoningStrategy.ZS_COT``. +- ``safe_mode``: Estimate cost before execution. +- ``progress_bar_desc``: Progress bar label. +- ``**model_kwargs``: Extra keyword arguments passed to the configured LM. diff --git a/docs/usage.rst b/docs/usage.rst index a9b53a1d..08572323 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -1,25 +1,25 @@ Tracking LM Usage -======= +================= -Print and Reseet LM Usage Stats ---------- +Print and Reset LM Usage Stats +------------------------------ To track usage of the LLM you've configured, you can simply access the built-in ``print_total_usage`` .. code-block:: python -lotus.settings.lm.print_total_usage() + lotus.settings.lm.print_total_usage() You can also reset the LLM usage stats as follows: .. code-block:: python -lotus.settings.lm.reset_stats() + lotus.settings.lm.reset_stats() Setting Usage Limits ------------ +-------------------- As a safety measure, the LM class supports setting usage limits to control costs and token consumption. 
You can set limits on: - Prompt tokens @@ -50,4 +50,4 @@ Example setting usage limits: course_df = course_df.sem_filter("What {Course Name} requires a lot of math") except LotusUsageLimitException as e: print(f"Usage limit exceeded: {e}") - # Handle the exception as needed \ No newline at end of file + # Handle the exception as needed diff --git a/docs/vector_store.rst b/docs/vector_store.rst index 162dc196..9eb0cd45 100644 --- a/docs/vector_store.rst +++ b/docs/vector_store.rst @@ -1,10 +1,10 @@ Vector Stores -===================== +============= Lotus supports multiple vector store backends for efficient semantic indexing and search. This document describes how to use and configure the available vector stores, including Qdrant, Faiss, and Weaviate. Supported Vector Stores ----------------------- +----------------------- - QdrantVS - FaissVS - WeaviateVS diff --git a/docs/web_extract.rst b/docs/web_extract.rst new file mode 100644 index 00000000..ffbbb0e1 --- /dev/null +++ b/docs/web_extract.rst @@ -0,0 +1,97 @@ +web_extract +============ + +``web_extract`` extracts full text from URLs or corpus-specific document IDs +and returns the results as a pandas DataFrame. Use it after :doc:`web_search` +when search results point to documents you want to process, or use it directly +when you already know the document IDs or URLs. + +Basic Extraction +---------------- + +``doc_ids`` and ``urls`` each accept either a string or a list of strings. The +result has ``id``, ``url``, and ``full_text`` columns. + +.. code-block:: python + + from lotus import WebSearchCorpus, web_extract + + df = web_extract( + WebSearchCorpus.ARXIV, + doc_ids="2303.08774", + ) + + print(df[["id", "url", "full_text"]]) + +Extract Multiple Documents +-------------------------- + +.. 
code-block:: python + + df = web_extract( + WebSearchCorpus.TAVILY, + urls=[ + "https://en.wikipedia.org/wiki/Artificial_intelligence", + "https://en.wikipedia.org/wiki/Machine_learning", + ], + max_length=20_000, + ) + +When the provider supports batching, LOTUS sends one batched request. +Otherwise it fetches each identifier separately. ``delay`` controls the pause +between non-batched fetches. + +Document IDs and URLs +--------------------- + +For arXiv and PubMed, ``doc_ids`` are converted to canonical document URLs. +For other corpora, ``doc_ids`` are treated as URLs. Passing ``urls`` always +uses the given URL directly. + +.. code-block:: python + + pubmed = web_extract( + WebSearchCorpus.PUBMED, + doc_ids=["12345678", "23456789"], + ) + + page = web_extract( + WebSearchCorpus.YOU, + urls="https://example.com/article", + ) + +Using Extracted Text +-------------------- + +The returned DataFrame works with semantic operators and LazyFrames. For +example, you can extract papers and then summarize their full text. + +.. code-block:: python + + papers = web_extract( + WebSearchCorpus.ARXIV, + doc_ids=["2407.11418", "2309.06180"], + max_length=40_000, + ) + + summary = papers.sem_agg( + "Summarize the shared technical themes across {full_text}." + ) + +Parameters +---------- + +.. code-block:: python + + web_extract( + corpus, + doc_ids=None, + urls=None, + max_length=None, + delay=0.1, + ) + +API Reference +------------- + +.. autofunction:: lotus.web_search.web_extract diff --git a/docs/web_search.rst b/docs/web_search.rst index e94d14ae..d4f7f724 100644 --- a/docs/web_search.rst +++ b/docs/web_search.rst @@ -1,267 +1,139 @@ web_search -======================== +=========== -Overview ---------- -The `web_search` function allows you to load documents from the web, then process that data with LOTUS. +``web_search`` loads web search results into a pandas DataFrame. 
Use it when +you need a tabular set of search results before applying semantic operators, +pandas transformations, or a LazyFrame pipeline. -Different search engines are supported, including Google, Google Scholar, Arxiv, You.com and Tavily. +Use :doc:`web_extract` when you already have URLs or corpus-specific document +IDs and want the full text. -Arxiv Example --------- -To get started, you will need to install the lotus submodule as follows: -.. code-block:: shell - pip install lotus[arxiv] +Supported corpora are: -Then you can run your lotus program: +- ``WebSearchCorpus.GOOGLE`` +- ``WebSearchCorpus.GOOGLE_SCHOLAR`` +- ``WebSearchCorpus.ARXIV`` +- ``WebSearchCorpus.YOU`` +- ``WebSearchCorpus.TAVILY`` +- ``WebSearchCorpus.PUBMED`` +- ``WebSearchCorpus.BING``; Bing is discontinued and raises a deprecation + warning in the current implementation. -.. code-block:: python - - import lotus - from lotus import WebSearchCorpus, web_search - from lotus.models import LM - - lm = LM(model="gpt-4o-mini") - - lotus.settings.configure(lm=lm) - - df = web_search(WebSearchCorpus.ARXIV, "deep learning", 5)[["title", "abstract"]] - print(f"Results from Arxiv\n{df}\n\n") - - most_interesting_articles = df.sem_topk("Which {abstract} is most exciting?", K=1) - print(f"Most interesting article: \n{most_interesting_articles.iloc[0]}") - -Google Example --------- -Before running the following example, you need to set the `SERPAPI_API_KEY` environment variable. You will also need to install the lotus submodule as follows: -.. code-block:: shell - pip install lotus[serpapi] - -Then you can run your lotus program: - -.. 
code-block:: python - - import lotus - from lotus import WebSearchCorpus, web_search - from lotus.models import LM - - lm = LM(model="gpt-4o-mini") - - lotus.settings.configure(lm=lm) +Basic Search +------------ - df = web_search(WebSearchCorpus.GOOGLE, "deep learning research", 5)[["title", "snippet"]] - print(f"Results from Google\n{df}") - most_interesting_articles = df.sem_topk("Which {snippet} is the most exciting?", K=1) - print(f"Most interesting articles\n{most_interesting_articles}") - -You.com Example --------- -Before running the following example, you need to set the `YOU_API_KEY` environment variable. You will also need to install the lotus submodule as follows: -.. code-block:: shell - pip install lotus[you] - -Then you can run your lotus program: +``web_search`` accepts one query or a list of queries and returns one DataFrame +with a ``query`` column. .. code-block:: python - import lotus from lotus import WebSearchCorpus, web_search - from lotus.models import LM - - lm = LM(model="gpt-4o-mini") - - lotus.settings.configure(lm=lm) - - df = web_search(WebSearchCorpus.YOU, "latest AI breakthroughs", 10)[["title", "snippet"]] - print(f"Results from You.com:\n{df}\n") - top_you_articles = df.sem_topk("Which {snippet} is the most groundbreaking?", K=3) - print(f"Top 3 most interesting articles from You.com:\n{top_you_articles}") + df = web_search( + WebSearchCorpus.ARXIV, + query="lazy dataframe query optimization", + K=5, + ) -Tavily Example --------- -Before running the following example, you need to set the `TAVILY_API_KEY` environment variable. You will also need to install the lotus submodule as follows: -.. code-block:: shell - pip install lotus[tavily] + print(df[["title", "abstract", "query"]]) -Then you can run your lotus program: +Search Multiple Queries +----------------------- .. 
code-block:: python - import lotus - from lotus import WebSearchCorpus, web_search - from lotus.models import LM - - lm = LM(model="gpt-4o-mini") - - lotus.settings.configure(lm=lm) - - df = web_search(WebSearchCorpus.TAVILY, "AI ethics in 2025", 10)[["title", "summary"]] - print(f"Results from Tavily:\n{df}\n") - top_tavily_articles = df.sem_topk("Which {summary} best explains ethical concerns in AI?", K=3) - print(f"Top 3 articles from Tavily on AI ethics:\n{top_tavily_articles}") + df = web_search( + WebSearchCorpus.PUBMED, + query=[ + "large language models clinical summarization", + "retrieval augmented generation medicine", + ], + K=3, + ) +Date Filtering +-------------- -Date Filtering Example --------------------- -You can filter search results by date range using the ``start_date`` and ``end_date`` parameters: +``start_date`` and ``end_date`` filter results for Google, Google Scholar, +arXiv, You.com, Tavily, and PubMed. ``sort_by_date`` is supported for arXiv. .. code-block:: python from datetime import datetime from lotus import WebSearchCorpus, web_search - # Search for papers published in 2024 - start = datetime(2024, 1, 1) - end = datetime(2024, 12, 31) - - df = web_search( - WebSearchCorpus.ARXIV, - "transformer architecture", - 10, - start_date=start, - end_date=end - ) - - # Search for recent news from the past month - from datetime import timedelta - one_month_ago = datetime.now() - timedelta(days=30) - df = web_search( - WebSearchCorpus.TAVILY, - "AI developments", + WebSearchCorpus.ARXIV, + "transformer architecture", 10, - start_date=one_month_ago + sort_by_date=True, + start_date=datetime(2024, 1, 1), + end_date=datetime(2024, 12, 31), ) +Select Columns +-------------- -Required Parameters --------------------- -- **corpus** : The search corpus to use. 
Available options: - - ``WebSearchCorpus.ARXIV``: Search academic papers on arxiv.org - - ``WebSearchCorpus.GOOGLE``: Search the web using Google Search - - ``WebSearchCorpus.GOOGLE_SCHOLAR``: Search academic papers using Google Scholar - - ``WebSearchCorpus.YOU``: Search the web using You.com - - ``WebSearchCorpus.TAVILY``: Search the web using Tavily -- **query** : The query to search for -- **K** : The number of results to return - -Optional Parameters --------------------- -- **cols** : The columns to take from the API search results. Default values should be sufficient for most use cases. To see available columns, enable logging: - - .. code-block:: python - - import logging - logging.basicConfig(level=logging.INFO) - -- **start_date** : Optional start date for filtering results (as a ``datetime`` object). - Returns only results created or published on or after this date. - -- **end_date** : Optional end date for filtering results (as a ``datetime`` object). - Returns only results created or published on or before this date. - - -web_extract -======================== - -Overview ---------- -The `web_extract` function allows you to extract full text content from specific URLs or document IDs across different search engines. This is useful when you already know the URL or ID of a document and want to extract its full content for processing with LOTUS. - -The function returns a simple DataFrame with three columns: ``id``, ``url``, and ``full_text``. - -Arxiv Extract Example --------- -To get started, you will need to install the lotus submodule as follows: -.. code-block:: shell - pip install lotus[arxiv] - -Then you can run your lotus program: - -.. 
code-block:: python - - import lotus - from lotus import WebSearchCorpus, web_extract - from lotus.models import LM - - lm = LM(model="gpt-4o-mini") - - lotus.settings.configure(lm=lm) - - # Extract full text from an arXiv paper using its ID - df = web_extract(WebSearchCorpus.ARXIV, doc_id="2303.08774") - print(f"Extracted from ArXiv:\n{df}\n\n") - - # Use the extracted full text for semantic operations - if df["full_text"].iloc[0]: - print(f"Full text length: {len(df['full_text'].iloc[0])} characters") - - -Tavily Extract Example --------- -Before running the following example, you need to set the `TAVILY_API_KEY` environment variable. You will also need to install the lotus submodule as follows: -.. code-block:: shell - pip install lotus[tavily] - -Then you can run your lotus program: +Use ``cols`` to request a subset of result fields. .. code-block:: python - import lotus - from lotus import WebSearchCorpus, web_extract - from lotus.models import LM + df = web_search( + WebSearchCorpus.TAVILY, + "AI safety evaluations", + 5, + cols=["title", "url", "content"], + ) - lm = LM(model="gpt-4o-mini") +Common default columns include: - lotus.settings.configure(lm=lm) +- arXiv: ``id``, ``title``, ``link``, ``abstract``, ``published``, + ``authors``, ``categories`` +- Google and Google Scholar: ``title``, ``link``, ``snippet``, ``date``, + ``publication_info`` +- You.com: ``title``, ``url``, ``snippets``, ``description`` +- Tavily: ``title``, ``url``, ``content`` +- PubMed: ``id``, ``title``, ``link``, ``abstract``, ``published``, + ``authors``, ``journal``, ``doi``, ``methods``, ``results``, + ``conclusions`` - # Extract full text from a URL using Tavily Extract API - df = web_extract(WebSearchCorpus.TAVILY, url="https://en.wikipedia.org/wiki/Artificial_intelligence") - print(f"Extracted from Tavily:\n{df}\n\n") +Required Setup +-------------- - # Use the extracted full text for semantic operations - if df["full_text"].iloc[0]: - print(f"Full text length: 
{len(df['full_text'].iloc[0])} characters") +- Google and Google Scholar require ``SERPAPI_API_KEY`` and the ``serpapi`` + extra. +- arXiv requires the ``arxiv`` extra. +- PubMed requires the ``pubmed`` extra. +- You.com requires ``YOU_API_KEY`` and the ``web_search`` extra. +- Tavily requires ``TAVILY_API_KEY`` and the ``web_search`` extra. +.. code-block:: console -PubMed Extract Example --------- -To get started, you will need to install the lotus submodule as follows: -.. code-block:: shell - pip install lotus[pubmed] + $ pip install "lotus-ai[serpapi]" + $ pip install "lotus-ai[arxiv]" + $ pip install "lotus-ai[pubmed]" + $ pip install "lotus-ai[web_search]" -Then you can run your lotus program: +Parameters +---------- .. code-block:: python - import lotus - from lotus import WebSearchCorpus, web_extract - from lotus.models import LM - - lm = LM(model="gpt-4o-mini") - - lotus.settings.configure(lm=lm) - - # Extract full text from a PubMed article using its ID - df = web_extract(WebSearchCorpus.PUBMED, doc_id="12345678") - print(f"Extracted from PubMed:\n{df}\n\n") + web_search( + corpus, + query, + K, + cols=None, + sort_by_date=False, + start_date=None, + end_date=None, + delay=0.1, + ) - # Use the extracted full text for semantic operations - if df["full_text"].iloc[0]: - print(f"Full text length: {len(df['full_text'].iloc[0])} characters") +API Reference +------------- -web_extract Required Parameters --------------------- -- **corpus** : The search corpus to use. 
Available options: - - ``WebSearchCorpus.ARXIV``: Extract from academic papers on arxiv.org - - ``WebSearchCorpus.GOOGLE``: Extract from URLs using standard HTTP fetching - - ``WebSearchCorpus.GOOGLE_SCHOLAR``: Extract from URLs using standard HTTP fetching - - ``WebSearchCorpus.YOU``: Extract from URLs using standard HTTP fetching - - ``WebSearchCorpus.TAVILY``: Extract from URLs using Tavily Extract API - - ``WebSearchCorpus.PUBMED``: Extract from PubMed articles -- **doc_id** or **url** : Either a corpus-specific identifier (required for ARXIV/PUBMED if url not provided) or a URL to fetch. You must provide exactly one of these parameters. +.. autoclass:: lotus.web_search.WebSearchCorpus + :members: -web_extract Optional Parameters --------------------- -- **max_length** : Optional maximum character length for extracted full text. If provided, the extracted text will be truncated to this length. +.. autofunction:: lotus.web_search.web_search diff --git a/lotus/ast/lazyframe.py b/lotus/ast/lazyframe.py index a604ae48..146bbc08 100644 --- a/lotus/ast/lazyframe.py +++ b/lotus/ast/lazyframe.py @@ -839,7 +839,7 @@ def load(cls, path: str | Path) -> "LazyFrame": def optimize( self, - optimizers: list["BaseOptimizer"], + optimizers: list["BaseOptimizer"] = [], *, inplace: bool = False, train_data: pd.DataFrame | dict["LazyFrame", pd.DataFrame] | None = None, @@ -860,6 +860,10 @@ def optimize( all_optimizers = (DEFAULT_OPTIMIZERS + optimizers) if auto_include_default_optimizers else optimizers + if not all_optimizers: + lotus.logger.warning("LazyFrame.optimize: no optimizers provided, returning original LazyFrame") + return + lotus.logger.debug( f"LazyFrame.optimize: {len(self._nodes)} nodes, " f"{len(all_optimizers)} optimizer(s), inplace={inplace}" ) From 8963ad0f60a9ec4f86e1df3107da6952ae249ccf Mon Sep 17 00:00:00 2001 From: Harshit Gupta Date: Wed, 15 Apr 2026 13:59:56 -0700 Subject: [PATCH 2/3] fix examples --- examples/lazy_frames/lazy_frame_demo.py | 107 ---- 
examples/lazy_frames/lazyframe.py | 55 ++ .../01_predicate_pushdown.py | 51 ++ .../optimizer_examples/02_gepa_filter.py | 75 +++ .../03_cascade_thresholds.py | 65 +++ .../optimizer_examples/gepa_01_filter.py | 136 ----- .../optimizer_examples/gepa_02_filter_map.py | 141 ----- .../optimizer_examples/gepa_03_selective.py | 133 ----- .../optimizer_examples/gepa_04_train_val.py | 135 ----- .../predicate_pushdown_example.py | 86 ---- .../lazy_frames/rag_pipeline_example.ipynb | 483 ------------------ examples/web_search_examples/tavily_topk.py | 4 +- examples/web_search_examples/you_topk.py | 4 +- 13 files changed, 250 insertions(+), 1225 deletions(-) delete mode 100644 examples/lazy_frames/lazy_frame_demo.py create mode 100644 examples/lazy_frames/lazyframe.py create mode 100644 examples/lazy_frames/optimizer_examples/01_predicate_pushdown.py create mode 100644 examples/lazy_frames/optimizer_examples/02_gepa_filter.py create mode 100644 examples/lazy_frames/optimizer_examples/03_cascade_thresholds.py delete mode 100644 examples/lazy_frames/optimizer_examples/gepa_01_filter.py delete mode 100644 examples/lazy_frames/optimizer_examples/gepa_02_filter_map.py delete mode 100644 examples/lazy_frames/optimizer_examples/gepa_03_selective.py delete mode 100644 examples/lazy_frames/optimizer_examples/gepa_04_train_val.py delete mode 100644 examples/lazy_frames/optimizer_examples/predicate_pushdown_example.py delete mode 100644 examples/lazy_frames/rag_pipeline_example.ipynb diff --git a/examples/lazy_frames/lazy_frame_demo.py b/examples/lazy_frames/lazy_frame_demo.py deleted file mode 100644 index 06336550..00000000 --- a/examples/lazy_frames/lazy_frame_demo.py +++ /dev/null @@ -1,107 +0,0 @@ -"""Demo: LazyFrame with joins and combined semantic + pandas filtering. - -Shows how to: -1. Chain sem_filter with pandas .filter() predicates -2. Use sem_join between two sources -3. Inspect the LazyFrame before execution - -Usage: - export OPENAI_API_KEY="sk-..." 
- python examples/lazy_frames/lazy_frame_demo.py -""" - -import pandas as pd - -import lotus -from lotus.ast import LazyFrame -from lotus.models import LM - -# ------------------------------------------------------------------ -# Configure the LM -# ------------------------------------------------------------------ -lm = LM(model="gpt-4o-mini") -lotus.settings.configure(lm=lm) - -# ------------------------------------------------------------------ -# Example 1: Combining semantic ops with pandas filtering -# ------------------------------------------------------------------ -print("=" * 60) -print("Example 1: sem_filter + pandas filter + sem_map") -print("=" * 60) - -courses_df = pd.DataFrame( - { - "Course Name": [ - "Probability and Random Processes", - "Optimization Methods in Engineering", - "Digital Design and Integrated Circuits", - "Computer Security", - "Cooking", - "Food Sciences", - ], - "Units": [4, 3, 4, 3, 2, 3], - } -) -print("\nSource data:") -print(courses_df) -print() - -# Build the LazyFrame — no LLM calls happen here -courses_lf = ( - LazyFrame() - .sem_filter("{Course Name} is about engineering or computer science") - .filter(lambda df: df["Units"] >= 3) # pandas predicate - .sem_map("What is a one-sentence summary of {Course Name}?") -) - -print("LazyFrame repr:") -print(repr(courses_lf)) -print() - -print("\nExecuting LazyFrame ...") -result = courses_lf.execute(courses_df) -print("\nResult:") -print(result) - -# ------------------------------------------------------------------ -# Example 2: sem_join between two sources -# ------------------------------------------------------------------ -print("\n" + "=" * 60) -print("Example 2: sem_join of two sources") -print("=" * 60) - -courses_df2 = pd.DataFrame( - { - "Course Name": [ - "Riemannian Geometry", - "Operating Systems", - "Intro to Computer Science", - "Food Science", - ] - } -) -skills_df = pd.DataFrame({"Skill": ["Math", "Computer Science"]}) - -# Create separate LazyFrames for courses 
and skills -courses_lf2 = LazyFrame() -skills_lf = LazyFrame() - -# Build LazyFrame with join between two sources -join_df = courses_lf2.sem_join(skills_lf, "Taking {Course Name:left} will help me learn {Skill:right}").sem_map( - "Explain how {Course Name} relates to {Skill}" -) - -print("\nSource (courses):") -print(courses_df2) -print("\nSource (skills):") -print(skills_df) -print() - -print("LazyFrame repr:") -print(repr(join_df)) -print() - -print("\nExecuting join LazyFrame ...") -join_result = join_df.execute({courses_lf2: courses_df2, skills_lf: skills_df}) -print("\nJoin result:") -print(join_result) diff --git a/examples/lazy_frames/lazyframe.py b/examples/lazy_frames/lazyframe.py new file mode 100644 index 00000000..3c9f1287 --- /dev/null +++ b/examples/lazy_frames/lazyframe.py @@ -0,0 +1,55 @@ +"""LazyFrame quickstart: define, inspect, and execute one semantic filter. + +Usage: + export OPENAI_API_KEY="..." + python examples/lazy_frames/lazyframe.py +""" + +import pandas as pd + +import lotus +from lotus.ast import LazyFrame +from lotus.models import LM + +# Configure the LM once before running semantic operators. +lm = LM(model="gpt-4.1-nano") +lotus.settings.configure(lm=lm) + +issues = pd.DataFrame( + { + "issue_title": [ + "Fix typo in README", + "Add dark mode support to dashboard", + "Refactor entire auth system to use OAuth2", + "Update copyright year in LICENSE", + "Implement distributed transaction support across microservices", + "Change button color on settings page", + "Migrate database from Postgres 13 to 16 with zero downtime", + "Add missing comma in error message", + "Build custom query planner to replace third-party dependency", + "Bump lodash to fix known CVE", + "Support multi-region active-active replication", + "Remove unused import in utils.py", + ] + } +) + +# Build the LazyFrame pipeline. No LM calls happen until execute(). 
+pipeline = LazyFrame().sem_filter( + "The {issue_title} describes a small, self-contained task that a new open " + "source contributor could tackle without deep knowledge of the codebase" +) + +# Inspect the logical plan before spending any LM calls. +print("Logical plan:") +pipeline.print_tree() +# OUTPUT: +# Logical plan: +# sem_filter('The {issue_title} describes a small, self-contained task that a new open source contributor could tackle without deep knowledge of the codebase') +# -- Source(bound=False) + +# Execute the plan on the DataFrame. +good_first_issues = pipeline.execute(issues) + +print("\nGood first issues:") +print(good_first_issues.to_string(index=False)) diff --git a/examples/lazy_frames/optimizer_examples/01_predicate_pushdown.py b/examples/lazy_frames/optimizer_examples/01_predicate_pushdown.py new file mode 100644 index 00000000..11cd79b9 --- /dev/null +++ b/examples/lazy_frames/optimizer_examples/01_predicate_pushdown.py @@ -0,0 +1,51 @@ +"""Optimizer example: predicate pushdown only. + +This example prints the plan before and after optimization. It does not execute +the pipeline or call an LM. + +Usage: + python examples/lazy_frames/optimizer_examples/01_predicate_pushdown.py +""" + +import pandas as pd + +from lotus.ast import LazyFrame + +# Include a cheap pandas predicate and an expensive semantic predicate. +issues = pd.DataFrame( + { + "issue_title": [ + "Fix typo in README", + "Add dark mode support to dashboard", + "Bump lodash to fix known CVE", + ], + "priority": ["low", "medium", "critical"], + } +) + +# The pandas filter is written after sem_filter on purpose. Predicate pushdown +# can move it earlier so fewer rows reach the semantic filter. +pipeline = ( + LazyFrame(df=issues) + .sem_filter("{issue_title} is a good first issue") + .filter(lambda df: df["priority"] != "critical") +) + +# Calling optimize([]) still runs default optimizers, including predicate pushdown. 
+optimized = pipeline.optimize([]) + +print("Before optimization:") +pipeline.print_tree() +# OUTPUT: +# Before optimization: +# filter(...) +# -- sem_filter('{issue_title} is a good first issue') +# -- Source(bound=True) + +print("\nAfter optimization:") +optimized.print_tree() +# OUTPUT: +# After optimization: +# sem_filter('{issue_title} is a good first issue') +# -- filter(...) +# -- Source(bound=True) \ No newline at end of file diff --git a/examples/lazy_frames/optimizer_examples/02_gepa_filter.py b/examples/lazy_frames/optimizer_examples/02_gepa_filter.py new file mode 100644 index 00000000..1d72c50f --- /dev/null +++ b/examples/lazy_frames/optimizer_examples/02_gepa_filter.py @@ -0,0 +1,75 @@ +"""Optimizer example: GEPA tunes one sem_filter instruction. + +Requirements: + pip install "lotus-ai[gepa]" + export OPENAI_API_KEY="..." + +Usage: + python examples/lazy_frames/optimizer_examples/02_gepa_filter.py +""" + +import pandas as pd + +import lotus +from lotus.ast import LazyFrame +from lotus.ast.optimizer import GEPAOptimizer +from lotus.models import LM +from gepa.optimize_anything import EngineConfig, GEPAConfig + +# Configure the LM that runs the semantic filter and GEPA candidates. +lm = LM(model="gpt-4.1-nano") +lotus.settings.configure(lm=lm) + +# Training data for the optimizer. The eval function below scores which rows +# the candidate pipeline keeps. 
+issues = pd.DataFrame( + { + "issue_title": [ + "Fix typo in README", + "Add dark mode support to dashboard", + "Refactor entire auth system to use OAuth2", + "Update copyright year in LICENSE", + "Implement distributed transaction support across microservices", + "Change button color on settings page", + "Migrate database from Postgres 13 to 16 with zero downtime", + "Add missing comma in error message", + "Build custom query planner to replace third-party dependency", + "Bump lodash to fix known CVE", + "Support multi-region active-active replication", + "Remove unused import in utils.py", + ] + } +) + +GOOD_FIRST_ISSUE_IDS = {0, 3, 5, 7, 9, 11} + + +# Score a candidate pipeline by F1 against hand-labeled good-first-issue rows. +def eval_fn(output_df: pd.DataFrame, example: dict) -> tuple[float, dict[str, float]]: + kept = set(output_df.index) + true_positive = len(kept & GOOD_FIRST_ISSUE_IDS) + precision = true_positive / max(len(kept), 1) + recall = true_positive / max(len(GOOD_FIRST_ISSUE_IDS), 1) + f1 = 2 * precision * recall / max(precision + recall, 1e-9) + return f1, {"precision": precision, "recall": recall} + + +# Start with a simple prompt that GEPA can improve. +pipeline = LazyFrame().sem_filter("{issue_title} is a small starter task") + +# GEPA rewrites optimizable instructions to maximize eval_fn. +optimizer = GEPAOptimizer( + eval_fn=eval_fn, + objective=( + "Find a sem_filter instruction that keeps issue titles suitable for new open " + "source contributors: small, self-contained tasks that do not require deep " + "codebase knowledge." + ), + gepa_config=GEPAConfig(engine=EngineConfig(max_metric_calls=20)), +) + +# Optimize on the labeled examples, then execute the optimized pipeline. 
+optimized = pipeline.optimize([optimizer], train_data=issues) +result = optimized.execute(issues) + +print(result.to_string(index=True)) diff --git a/examples/lazy_frames/optimizer_examples/03_cascade_thresholds.py b/examples/lazy_frames/optimizer_examples/03_cascade_thresholds.py new file mode 100644 index 00000000..775d2040 --- /dev/null +++ b/examples/lazy_frames/optimizer_examples/03_cascade_thresholds.py @@ -0,0 +1,65 @@ +"""Optimizer example: pre-warm sem_filter cascade thresholds. + +Usage: + export OPENAI_API_KEY="..." + python examples/lazy_frames/optimizer_examples/03_cascade_thresholds.py +""" + +import pandas as pd + +import lotus +from lotus.ast import LazyFrame +from lotus.ast.optimizer import CascadeOptimizer +from lotus.models import LM +from lotus.types import CascadeArgs + +# Configure a main LM and a cheaper helper LM for the cascade. +lotus.settings.configure( + lm=LM(model="gpt-4o"), + helper_lm=LM(model="gpt-4o-mini"), +) + +# Use the shared issue-title dataset so the example stays focused on cascades. +issues = pd.DataFrame( + { + "issue_title": [ + "Fix typo in README", + "Add dark mode support to dashboard", + "Refactor entire auth system to use OAuth2", + "Update copyright year in LICENSE", + "Implement distributed transaction support across microservices", + "Change button color on settings page", + "Migrate database from Postgres 13 to 16 with zero downtime", + "Add missing comma in error message", + "Build custom query planner to replace third-party dependency", + "Bump lodash to fix known CVE", + "Support multi-region active-active replication", + "Remove unused import in utils.py", + ] + } +) + +# CascadeArgs defines the target accuracy/cost tradeoff. +cascade_args = CascadeArgs( + recall_target=0.9, + precision_target=0.9, + sampling_percentage=0.5, + failure_probability=0.2, +) + +# Attach the cascade to the semantic filter. 
+pipeline = LazyFrame().sem_filter( + "{issue_title} is a good first issue", + cascade_args=cascade_args, +) + +# CascadeOptimizer learns thresholds on training data before full execution. +optimized = pipeline.optimize([CascadeOptimizer()], train_data=issues) + +# Print the learned cascade thresholds. +learned_cascade_args = optimized._nodes[1].cascade_args +print(f"Learned cascade thresholds: {learned_cascade_args.pos_cascade_threshold}, {learned_cascade_args.neg_cascade_threshold}") + +# Execute the optimized pipeline; the thresholds are not learned again. +result = optimized.execute(issues) +print(result.to_string(index=False)) diff --git a/examples/lazy_frames/optimizer_examples/gepa_01_filter.py b/examples/lazy_frames/optimizer_examples/gepa_01_filter.py deleted file mode 100644 index ae135d34..00000000 --- a/examples/lazy_frames/optimizer_examples/gepa_01_filter.py +++ /dev/null @@ -1,136 +0,0 @@ -"""GEPA optimizer — Example 1: Optimizing a sem_filter instruction. - -Starts from an overly strict initial filter and uses GEPA to automatically -broaden it to correctly capture all positive reviews (including lukewarm ones), -measured by F1 score. Rich side-info tells the reflection LLM exactly which -reviews were missed and which were incorrectly kept. - -Requirements: - pip install gepa - export OPENAI_API_KEY="..." 
-""" - -import pandas as pd - -import lotus -from lotus.ast import LazyFrame -from lotus.ast.optimizer import GEPAOptimizer -from lotus.models import LM - -lotus.settings.configure(lm=LM(model="gpt-4o-mini")) - -# --------------------------------------------------------------------------- -# Data — product reviews with ground-truth sentiment labels -# --------------------------------------------------------------------------- - -reviews_df = pd.DataFrame( - { - "review": [ - "Absolutely love this product, works perfectly!", # 0 positive - "Complete waste of money, broke after a week.", # 1 negative - "It's okay, nothing special but does the job.", # 2 neutral - "Best purchase I've made this year, highly recommend!", # 3 positive - "Terrible quality, very disappointed.", # 4 negative - "Pretty good value for the price.", # 5 positive (lukewarm) - "Arrived damaged and customer service was unhelpful.", # 6 negative - "Exceeded my expectations, will buy again.", # 7 positive - ] - } -) - -POSITIVE_IDX = {0, 3, 5, 7} # ground-truth positive rows - - -# --------------------------------------------------------------------------- -# Evaluation function — F1 score with rich diagnostic side info -# --------------------------------------------------------------------------- - - -def eval_fn(output_df: pd.DataFrame, example: dict) -> tuple[float, dict]: - """F1 + side info that tells GEPA's reflection LLM exactly what was missed.""" - kept = set(output_df.index) - tp = len(POSITIVE_IDX & kept) - precision = tp / max(len(kept), 1) - recall = tp / max(len(POSITIVE_IDX), 1) - f1 = 2 * precision * recall / max(precision + recall, 1e-9) - - missed_texts = [reviews_df.loc[i, "review"] for i in sorted(POSITIVE_IDX - kept)] - false_pos = [reviews_df.loc[i, "review"] for i in sorted(kept - POSITIVE_IDX)] - - return f1, { - "precision": round(precision, 3), - "recall": round(recall, 3), - "missed_positives": missed_texts, # reviews that should have been kept - "false_positives": 
false_pos, # reviews that should NOT have been kept - } - - -# --------------------------------------------------------------------------- -# Build the initial (unoptimized) LazyFrame -# --------------------------------------------------------------------------- - -# Start with an overly strict instruction — it will miss "Pretty good value for -# the price." (a lukewarm positive) because the model treats it as not enthusiastic. -lf = LazyFrame().sem_filter("{review} is an extremely enthusiastic positive review with strong praise") - -print("=" * 70) -print("Initial LazyFrame:") -print("=" * 70) -lf.print_tree() - -# --------------------------------------------------------------------------- -# Optimize -# --------------------------------------------------------------------------- - -try: - from gepa.optimize_anything import EngineConfig, GEPAConfig # type: ignore[import] -except ImportError: - print("\n[gepa not installed — run `pip install gepa` to continue]\n") - raise SystemExit(0) - -optimizer = GEPAOptimizer( - eval_fn=eval_fn, - # Explicit objective guides the reflection LLM toward broader coverage - objective=( - "Find a filter instruction that keeps ALL positive reviews — including lukewarm " - "ones like 'pretty good value' — while correctly excluding negative and neutral ones. " - "Maximize F1 score (harmonic mean of precision and recall). " - "If missed_positives is non-empty, the instruction is too strict and must be broadened." 
- ), - gepa_config=GEPAConfig(engine=EngineConfig(max_metric_calls=30)), - valset=reviews_df.iloc[5:].reset_index(drop=True), -) - -optimized_lf = lf.optimize([optimizer], train_data=reviews_df.iloc[:5].reset_index(drop=True)) - -# --------------------------------------------------------------------------- -# Show results -# --------------------------------------------------------------------------- - -from lotus.ast.nodes import SemFilterNode # noqa: E402 - -original_instruction = next(n for n in lf._nodes if isinstance(n, SemFilterNode)).user_instruction -optimized_instruction = next(n for n in optimized_lf._nodes if isinstance(n, SemFilterNode)).user_instruction - -print("\n" + "=" * 70) -print("Instruction comparison:") -print("=" * 70) -print(f" Before: {original_instruction!r}") -print(f" After: {optimized_instruction!r}") - -# Execute and show the filtered output -print("\n" + "=" * 70) -print("Filtered reviews (optimized LazyFrame):") -print("=" * 70) -result_df = optimized_lf.execute(reviews_df) -print(result_df[["review"]].to_string(index=True)) - -kept = set(result_df.index) -tp = len(POSITIVE_IDX & kept) -precision = tp / max(len(kept), 1) -recall = tp / max(len(POSITIVE_IDX), 1) -f1 = 2 * precision * recall / max(precision + recall, 1e-9) -print( - f"\nKept {len(result_df)}/{len(reviews_df)} reviews | F1={f1:.3f} " - f"(precision={precision:.3f}, recall={recall:.3f})" -) diff --git a/examples/lazy_frames/optimizer_examples/gepa_02_filter_map.py b/examples/lazy_frames/optimizer_examples/gepa_02_filter_map.py deleted file mode 100644 index a59a78f4..00000000 --- a/examples/lazy_frames/optimizer_examples/gepa_02_filter_map.py +++ /dev/null @@ -1,141 +0,0 @@ -"""GEPA optimizer — Example 2: Jointly optimizing a filter + map LazyFrame. - -Demonstrates multi-step optimization: both the sem_filter instruction (which -papers to keep) and the sem_map instruction (how to summarize them) are evolved -simultaneously by GEPA to maximize a composite score. 
- -Requirements: - pip install gepa - export OPENAI_API_KEY="..." -""" - -import pandas as pd - -import lotus -from lotus.ast import LazyFrame -from lotus.ast.optimizer import GEPAOptimizer -from lotus.models import LM - -lotus.settings.configure(lm=LM(model="gpt-4o-mini")) - -# --------------------------------------------------------------------------- -# Data — ML papers; goal: keep NLP/LLM papers and produce concise summaries -# --------------------------------------------------------------------------- - -papers_df = pd.DataFrame( - { - "title": [ - "Attention Is All You Need", - "BERT: Pre-training of Deep Bidirectional Transformers", - "ImageNet Classification with Deep CNNs", - "Generative Adversarial Networks", - "Deep Residual Learning for Image Recognition", - "GPT-3: Language Models are Few-Shot Learners", - ], - "abstract": [ - "We propose the Transformer architecture based solely on attention mechanisms, " - "dispensing with recurrence entirely.", - "We introduce BERT, a deeply bidirectional language representation model " "pre-trained on unlabeled text.", - "We trained a large deep convolutional neural network to classify 1.2 million " - "high-resolution images in the ImageNet LSVRC-2010 contest.", - "We propose a framework for estimating generative models via an adversarial " - "process, training two models simultaneously.", - "We present a residual learning framework to ease training of very deep networks. 
" - "Reformulating layers as learning residual functions.", - "We demonstrate that scaling up language models greatly improves task-agnostic " - "few-shot performance across many NLP benchmarks.", - ], - } -) - -NLP_IDX = {0, 1, 5} # Transformer, BERT, GPT-3 — the NLP papers - - -# --------------------------------------------------------------------------- -# Evaluation — reward keeping NLP papers AND producing short summaries -# --------------------------------------------------------------------------- - - -def eval_fn(output_df: pd.DataFrame, example: dict) -> tuple[float, dict]: - kept = set(output_df.index) - recall = len(NLP_IDX & kept) / len(NLP_IDX) - precision = len(NLP_IDX & kept) / max(len(kept), 1) - f1 = 2 * precision * recall / max(precision + recall, 1e-9) - - # Bonus for concise summaries (under 150 characters) - summary_col = "summary" - if summary_col in output_df.columns and len(output_df) > 0: - concise = sum(len(str(s)) < 150 for s in output_df[summary_col].dropna()) - brevity_score = concise / len(output_df) - else: - brevity_score = 0.0 - - score = 0.7 * f1 + 0.3 * brevity_score - return score, {"f1": round(f1, 3), "recall": round(recall, 3), "brevity": round(brevity_score, 3)} - - -# --------------------------------------------------------------------------- -# Build the initial LazyFrame -# --------------------------------------------------------------------------- - -lf = ( - LazyFrame(df=papers_df) - .sem_filter("{title} and {abstract} are about natural language processing") - .sem_map("Write a one-sentence summary of {abstract}", suffix="summary") -) - -print("=" * 70) -print("Initial LazyFrame:") -print("=" * 70) -lf.print_tree() - -# --------------------------------------------------------------------------- -# Optimize -# --------------------------------------------------------------------------- - -try: - from gepa.optimize_anything import EngineConfig, GEPAConfig # type: ignore[import] -except ImportError: - print("\n[gepa not 
installed — run `pip install gepa` to continue]\n") - raise SystemExit(0) - -optimizer = GEPAOptimizer( - eval_fn=eval_fn, - gepa_config=GEPAConfig(engine=EngineConfig(max_metric_calls=30)), -) - -optimized_lf = lf.optimize([optimizer], train_data=papers_df) - -# --------------------------------------------------------------------------- -# Show results -# --------------------------------------------------------------------------- - -from lotus.ast.nodes import SemFilterNode, SemMapNode # noqa: E402 - -orig_filter = next(n for n in lf._nodes if isinstance(n, SemFilterNode)).user_instruction -opt_filter = next(n for n in optimized_lf._nodes if isinstance(n, SemFilterNode)).user_instruction -orig_map = next(n for n in lf._nodes if isinstance(n, SemMapNode)).user_instruction -opt_map = next(n for n in optimized_lf._nodes if isinstance(n, SemMapNode)).user_instruction - -print("\n" + "=" * 70) -print("Optimized LazyFrame:") -print("=" * 70) -optimized_lf.print_tree() - -print("\n" + "=" * 70) -print("Instruction comparison:") -print("=" * 70) -print(f" Filter before: {orig_filter!r}") -print(f" Filter after: {opt_filter!r}") -print(f" Map before: {orig_map!r}") -print(f" Map after: {opt_map!r}") - -print("\n" + "=" * 70) -print("Output (optimized LazyFrame):") -print("=" * 70) -result_df = optimized_lf.execute(papers_df) -for _, row in result_df.iterrows(): - print(f" [{row.name}] {row['title']}") - if "summary" in result_df.columns: - print(f" → {row['summary']}") -print(f"\nKept {len(result_df)}/{len(papers_df)} papers") diff --git a/examples/lazy_frames/optimizer_examples/gepa_03_selective.py b/examples/lazy_frames/optimizer_examples/gepa_03_selective.py deleted file mode 100644 index eb9133b9..00000000 --- a/examples/lazy_frames/optimizer_examples/gepa_03_selective.py +++ /dev/null @@ -1,133 +0,0 @@ -"""GEPA optimizer — Example 3: Selective optimization with mark_optimizable. 
- -Shows how to pin specific nodes so GEPA leaves them unchanged while still -optimizing the rest. The map instruction is locked (mark_optimizable=[]), -so only the filter instruction is evolved. - -Requirements: - pip install gepa - export OPENAI_API_KEY="..." -""" - -import pandas as pd - -import lotus -from lotus.ast import LazyFrame -from lotus.ast.optimizer import GEPAOptimizer -from lotus.models import LM - -lotus.settings.configure(lm=LM(model="gpt-4o-mini")) - -# --------------------------------------------------------------------------- -# Data — same papers as Example 2 -# --------------------------------------------------------------------------- - -papers_df = pd.DataFrame( - { - "title": [ - "Attention Is All You Need", - "BERT: Pre-training of Deep Bidirectional Transformers", - "ImageNet Classification with Deep CNNs", - "Generative Adversarial Networks", - "Deep Residual Learning for Image Recognition", - "GPT-3: Language Models are Few-Shot Learners", - ], - "abstract": [ - "We propose the Transformer architecture based solely on attention mechanisms, " - "dispensing with recurrence entirely.", - "We introduce BERT, a deeply bidirectional language representation model " "pre-trained on unlabeled text.", - "We trained a large deep convolutional neural network to classify 1.2 million " - "high-resolution images in the ImageNet LSVRC-2010 contest.", - "We propose a framework for estimating generative models via an adversarial " - "process, training two models simultaneously.", - "We present a residual learning framework to ease training of very deep networks. 
" - "Reformulating layers as learning residual functions.", - "We demonstrate that scaling up language models greatly improves task-agnostic " - "few-shot performance across many NLP benchmarks.", - ], - } -) - -NLP_IDX = {0, 1, 5} - - -def eval_fn(output_df: pd.DataFrame, example: dict) -> float: - kept = set(output_df.index) - tp = len(NLP_IDX & kept) - precision = tp / max(len(kept), 1) - recall = tp / max(len(NLP_IDX), 1) - return 2 * precision * recall / max(precision + recall, 1e-9) - - -# --------------------------------------------------------------------------- -# Build LazyFrame — map is pinned via mark_optimizable=[] -# --------------------------------------------------------------------------- - -PINNED_MAP = "Summarize {abstract} in exactly one sentence" - -lf = ( - LazyFrame(df=papers_df) - .sem_filter("{title} and {abstract} are about NLP") - # mark_optimizable=[] → GEPA will never touch this node - .sem_map(PINNED_MAP, suffix="summary", mark_optimizable=[]) -) - -print("=" * 70) -print("Initial LazyFrame (map instruction is pinned):") -print("=" * 70) -lf.print_tree() - -print("\nNodes eligible for optimization:") -for i, node in enumerate(lf._nodes): - if hasattr(node, "optimizable_params"): - status = "PINNED" if node.optimizable_params == frozenset() else "optimizable" - print(f" [{i}] {type(node).__name__:20s} → {status}") - -# --------------------------------------------------------------------------- -# Optimize -# --------------------------------------------------------------------------- - -try: - from gepa.optimize_anything import EngineConfig, GEPAConfig # type: ignore[import] -except ImportError: - print("\n[gepa not installed — run `pip install gepa` to continue]\n") - raise SystemExit(0) - -optimizer = GEPAOptimizer( - eval_fn=eval_fn, - gepa_config=GEPAConfig(engine=EngineConfig(max_metric_calls=20)), -) - -optimized_lf = lf.optimize([optimizer], train_data=papers_df) - -# 
--------------------------------------------------------------------------- -# Verify the map instruction was not changed -# --------------------------------------------------------------------------- - -from lotus.ast.nodes import SemFilterNode, SemMapNode # noqa: E402 - -opt_filter = next(n for n in optimized_lf._nodes if isinstance(n, SemFilterNode)).user_instruction -opt_map = next(n for n in optimized_lf._nodes if isinstance(n, SemMapNode)).user_instruction - -print("\n" + "=" * 70) -print("After optimization:") -print("=" * 70) -optimized_lf.print_tree() - -print("\n" + "=" * 70) -print("Instruction comparison:") -print("=" * 70) -print(f" Filter (evolved): {opt_filter!r}") -print(f" Map (pinned): {opt_map!r}") -assert opt_map == PINNED_MAP, "ERROR: pinned map instruction was changed!" -print(" ✓ Map instruction unchanged (as expected)") - -print("\n" + "=" * 70) -print("Output (optimized LazyFrame):") -print("=" * 70) -result_df = optimized_lf.execute(papers_df) -for _, row in result_df.iterrows(): - print(f" [{row.name}] {row['title']}") - if "summary" in result_df.columns: - print(f" → {row['summary']}") -print(f"\nKept {len(result_df)}/{len(papers_df)} papers") diff --git a/examples/lazy_frames/optimizer_examples/gepa_04_train_val.py b/examples/lazy_frames/optimizer_examples/gepa_04_train_val.py deleted file mode 100644 index d9ab804e..00000000 --- a/examples/lazy_frames/optimizer_examples/gepa_04_train_val.py +++ /dev/null @@ -1,135 +0,0 @@ -"""GEPA optimizer — Example 4: Train / val split for generalization. - -By passing a held-out validation set, GEPA selects the best candidate based -on generalization performance rather than training performance alone. This -prevents overfitting to the training examples. - -Requirements: - pip install gepa - export OPENAI_API_KEY="..." 
-""" - -import pandas as pd - -import lotus -from lotus.ast import LazyFrame -from lotus.ast.optimizer import GEPAOptimizer -from lotus.models import LM - -lotus.settings.configure(lm=LM(model="gpt-4o-mini")) - -# --------------------------------------------------------------------------- -# Data — 8 reviews split into train (5) and val (3) -# --------------------------------------------------------------------------- - -all_reviews = pd.DataFrame( - { - "review": [ - "Absolutely love this product, works perfectly!", # 0 positive - "Complete waste of money, broke after a week.", # 1 negative - "Best purchase I've made this year, highly recommend!", # 2 positive - "Terrible quality, very disappointed.", # 3 negative - "Pretty good value for the price.", # 4 positive - "Arrived damaged and customer service was unhelpful.", # 5 negative - "Exceeded my expectations, will buy again.", # 6 positive - "It does what it says, nothing more.", # 7 neutral-positive - ] - } -) - -train_df = all_reviews.iloc[:5].reset_index(drop=True) # rows 0-4 from original -val_df = all_reviews.iloc[5:].reset_index(drop=True) # rows 5-7 from original - -# Ground truth per split (indices after reset_index) -# train_df: 0=positive, 1=negative, 2=positive, 3=negative, 4=positive -POSITIVE_TRAIN = {0, 2, 4} -# val_df: 0="Arrived damaged" (negative), 1="Exceeded my expectations" (positive), -# 2="It does what it says" (neutral → not positive) -POSITIVE_VAL = {1} - - -# --------------------------------------------------------------------------- -# Shared evaluation function (works for both splits via the example dict) -# --------------------------------------------------------------------------- - - -def eval_fn(output_df: pd.DataFrame, example: dict) -> tuple[float, dict]: - """F1 score against the ground truth stored in example['expected_positive'].""" - expected = example.get("expected_positive", POSITIVE_TRAIN) - kept = set(output_df.index) - tp = len(expected & kept) - precision = tp / 
max(len(kept), 1) - recall = tp / max(len(expected), 1) - f1 = 2 * precision * recall / max(precision + recall, 1e-9) - return f1, {"precision": round(precision, 3), "recall": round(recall, 3)} - - -# --------------------------------------------------------------------------- -# Build the initial LazyFrame (trained on train_df) -# --------------------------------------------------------------------------- - -lf = LazyFrame(df=train_df).sem_filter("{review} is a positive review") - -print("=" * 70) -print("Initial LazyFrame:") -print("=" * 70) -lf.print_tree() -print(f"\nTraining examples : {len(train_df)} reviews") -print(f"Validation examples: {len(val_df)} reviews") - -# --------------------------------------------------------------------------- -# Optimize with separate train / val sets -# --------------------------------------------------------------------------- - -try: - from gepa.optimize_anything import EngineConfig, GEPAConfig # type: ignore[import] -except ImportError: - print("\n[gepa not installed — run `pip install gepa` to continue]\n") - raise SystemExit(0) - -# valset examples use the same eval_fn, but we inject the correct ground truth -optimizer = GEPAOptimizer( - eval_fn=eval_fn, - valset=[ - {"input": val_df, "expected_positive": POSITIVE_VAL}, - ], - gepa_config=GEPAConfig(engine=EngineConfig(max_metric_calls=30)), -) - -# Training data: list with one example so we can also inject expected_positive -optimized_lf = lf.optimize( - [optimizer], - train_data=[{"input": train_df, "expected_positive": POSITIVE_TRAIN}], -) - -# --------------------------------------------------------------------------- -# Show results -# --------------------------------------------------------------------------- - -from lotus.ast.nodes import SemFilterNode # noqa: E402 - -orig_instruction = next(n for n in lf._nodes if isinstance(n, SemFilterNode)).user_instruction -opt_instruction = next(n for n in optimized_lf._nodes if isinstance(n, SemFilterNode)).user_instruction 
- -print("\n" + "=" * 70) -print("Optimized LazyFrame:") -print("=" * 70) -optimized_lf.print_tree() - -print("\n" + "=" * 70) -print("Instruction comparison:") -print("=" * 70) -print(f" Before: {orig_instruction!r}") -print(f" After: {opt_instruction!r}") - -print("\n" + "=" * 70) -print("Generalization check — running on val set:") -print("=" * 70) -val_result = optimized_lf.execute(val_df) -print(val_result[["review"]].to_string(index=True)) -kept_val = set(val_result.index) -tp = len(POSITIVE_VAL & kept_val) -precision = tp / max(len(kept_val), 1) -recall = tp / max(len(POSITIVE_VAL), 1) -f1 = 2 * precision * recall / max(precision + recall, 1e-9) -print(f"\nVal F1={f1:.3f} (precision={precision:.3f}, recall={recall:.3f})") diff --git a/examples/lazy_frames/optimizer_examples/predicate_pushdown_example.py b/examples/lazy_frames/optimizer_examples/predicate_pushdown_example.py deleted file mode 100644 index 388935bc..00000000 --- a/examples/lazy_frames/optimizer_examples/predicate_pushdown_example.py +++ /dev/null @@ -1,86 +0,0 @@ -"""Example demonstrating predicate pushdown optimization. - -This example shows how the optimizer moves pandas filters before semantic filters -to improve performance by reducing the number of rows processed by expensive -semantic operations. 
-""" - -import pandas as pd - -from lotus.ast import LazyFrame -from lotus.ast.optimizer.predicate_pushdown import PredicatePushdownOptimizer - -# Create sample data -data = pd.DataFrame( - { - "name": ["Alice", "Bob", "Charlie", "David", "Eve"], - "age": [25, 30, 35, 40, 45], - "score": [85, 90, 75, 95, 80], - "department": ["Engineering", "Sales", "Engineering", "Marketing", "Sales"], - } -) - -print("=" * 80) -print("PREDICATE PUSHDOWN OPTIMIZATION EXAMPLE") -print("=" * 80) -print() - -# Create a LazyFrame with semantic filter followed by pandas filter -# This is suboptimal because we process all rows with the expensive semantic filter -# before applying the cheap pandas filter -print("BEFORE OPTIMIZATION:") -print("-" * 80) -lf_before = ( - LazyFrame() - .sem_filter("{name} works in the Engineering department") - .filter(lambda df: df["age"] > 30) - .sem_map("Summarize the role and experience of {name} in the {department} department") -) - -print("LazyFrame structure:") -print(lf_before) -print() -print("Execution order:") -print(" 1. Source: Load data") -print(" 2. sem_filter: Process ALL rows with expensive semantic operation") -print(" 3. filter: Apply cheap pandas filter (age > 30)") -print(" 4. sem_map: Process remaining rows") -print() -print("Problem: We're doing expensive semantic filtering on rows that will") -print(" be filtered out anyway by the pandas filter!") -print() - -# Apply optimization using LazyFrame.optimize() -lf_after = lf_before.optimize([PredicatePushdownOptimizer()]) - -print("AFTER OPTIMIZATION:") -print("-" * 80) -print("LazyFrame structure:") -print(lf_after) -print() -print("Execution order:") -print(" 1. Source: Load data") -print(" 2. filter: Apply cheap pandas filter (age > 30) FIRST") -print(" 3. sem_filter: Process FEWER rows with expensive semantic operation") -print(" 4. 
sem_map: Process remaining rows") -print() -print("Benefit: The pandas filter reduces the number of rows BEFORE the") -print(" expensive semantic filter, improving performance!") -print() - -# Show the difference in node order -print("NODE ORDER COMPARISON:") -print("-" * 80) -print("Before:") -for i, node in enumerate(lf_before._nodes): - print(f" {i}. {type(node).__name__}") - -print() -print("After:") -for i, node in enumerate(lf_after._nodes): - print(f" {i}. {type(node).__name__}") - -print() -print("=" * 80) -print("The filter node has been moved earlier in the LazyFrame!") -print("=" * 80) diff --git a/examples/lazy_frames/rag_pipeline_example.ipynb b/examples/lazy_frames/rag_pipeline_example.ipynb deleted file mode 100644 index 789046d0..00000000 --- a/examples/lazy_frames/rag_pipeline_example.ipynb +++ /dev/null @@ -1,483 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "04e51f7c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/harshitgupta/Documents/lotus/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "import lotus\n", - "from lotus.ast import LazyFrame\n", - "from lotus.models import LM" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "76d5d594", - "metadata": {}, - "outputs": [], - "source": [ - "# ------------------------------------------------------------------\n", - "# Configure the LM\n", - "# ------------------------------------------------------------------\n", - "lm = LM(model=\"gpt-4.1\", max_batch_size=10)\n", - "lotus.settings.configure(lm=lm)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f690645e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "======================================================================\n", - "Step 0: Setting up data sources\n", - "======================================================================\n", - "\n", - "Queries:\n", - " query_id query\n", - "0 1 What are the health benefits of exercise and w...\n", - "1 2 How does climate change affect biodiversity an...\n", - "\n", - "Document corpus:\n", - " doc_id content\n", - "0 1 Regular exercise improves cardiovascular healt...\n", - "1 2 Walking is an excellent low-impact exercise fo...\n", - "2 3 Strength training helps build muscle mass and ...\n", - "3 4 Exercise releases endorphins, which are natura...\n", - "4 5 Swimming is a great full-body workout that's e...\n", - "5 6 Climate change is causing rising temperatures ...\n", - "6 7 Habitat loss due to climate change is a major ...\n", - "7 8 Conservation efforts like protected areas and ...\n", - "8 9 Reducing carbon emissions is crucial for prote...\n", - "9 10 Community-based conservation programs involve ...\n" - ] - } - ], - "source": [ - "# ------------------------------------------------------------------\n", - "# Define the 
data\n", - "# ------------------------------------------------------------------\n", - "print(\"=\" * 70)\n", - "print(\"Step 0: Setting up data sources\")\n", - "print(\"=\" * 70)\n", - "\n", - "queries_df = pd.DataFrame(\n", - " {\n", - " \"query_id\": [1, 2],\n", - " \"query\": [\n", - " \"What are the health benefits of exercise and what types of exercise are best for beginners?\",\n", - " \"How does climate change affect biodiversity and what can be done to protect endangered species?\",\n", - " ],\n", - " }\n", - ")\n", - "print(\"\\nQueries:\")\n", - "print(queries_df)\n", - "\n", - "corpus_df = pd.DataFrame(\n", - " {\n", - " \"doc_id\": range(1, 11),\n", - " \"content\": [\n", - " # Health and exercise documents\n", - " \"Regular exercise improves cardiovascular health by strengthening the heart muscle and improving blood circulation. It reduces the risk of heart disease and stroke.\",\n", - " \"Walking is an excellent low-impact exercise for beginners. It requires no special equipment and can be done anywhere. Start with 15-20 minutes daily.\",\n", - " \"Strength training helps build muscle mass and increases metabolism. Beginners should start with bodyweight exercises like squats and push-ups.\",\n", - " \"Exercise releases endorphins, which are natural mood boosters. Regular physical activity can reduce symptoms of depression and anxiety.\",\n", - " \"Swimming is a great full-body workout that's easy on the joints. It's ideal for beginners and those with mobility issues.\",\n", - " # Climate and biodiversity documents\n", - " \"Climate change is causing rising temperatures that disrupt ecosystems worldwide. Many species cannot adapt quickly enough to survive.\",\n", - " \"Habitat loss due to climate change is a major threat to biodiversity. 
Polar bears, coral reefs, and rainforest species are particularly vulnerable.\",\n", - " \"Conservation efforts like protected areas and wildlife corridors help endangered species survive climate impacts.\",\n", - " \"Reducing carbon emissions is crucial for protecting biodiversity. Transitioning to renewable energy can slow climate change effects.\",\n", - " \"Community-based conservation programs involve local people in protecting endangered species and their habitats.\",\n", - " ],\n", - " }\n", - ")\n", - "\n", - "\n", - "print(\"\\nDocument corpus:\")\n", - "print(corpus_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "9ef2fd27", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - "Step 1: Building LazyFrames (lazy - no execution yet)\n", - "======================================================================\n", - "\n", - "Query LazyFrame:\n", - "sem_map('Break down {query} into 2-3 simpler, focused subqu...')\n", - " -- Source(bound=True)\n", - "\n", - "Corpus LazyFrame:\n", - "Source(bound=True)\n" - ] - } - ], - "source": [ - "print(\"\\n\" + \"=\" * 70)\n", - "print(\"Step 1: Building LazyFrames (lazy - no execution yet)\")\n", - "print(\"=\" * 70)\n", - "\n", - "# LazyFrame 1: Query decomposition\n", - "# Split queries into subqueries, then explode into separate rows\n", - "queries_lf = LazyFrame(df=queries_df)\n", - "query_df = queries_lf.sem_map(\n", - " \"Break down {query} into 2-3 simpler, focused subqueries that would help answer the main query. \"\n", - " \"Return ONLY a newline-separated list of subqueries, nothing else. 
\"\n", - " \"Each subquery should be on its own line.\",\n", - " suffix=\"_subqueries\",\n", - ")\n", - "\n", - "print(\"\\nQuery LazyFrame:\")\n", - "print(query_df.show())\n", - "\n", - "# LazyFrame 2: Corpus (just a source)\n", - "corpus_lf = LazyFrame(df=corpus_df)\n", - "\n", - "print(\"\\nCorpus LazyFrame:\")\n", - "print(corpus_lf.show())" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a607cfda", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - "Step 2: Explode subqueries into separate rows\n", - "======================================================================\n", - "\n", - "Exploded subqueries LazyFrame:\n", - "reset_index(drop=True)\n", - " -- [['query_id', 'query', 'subquery']]\n", - " -- ['subquery'] = ...\n", - " -- rename(columns={'subquery_list': 'subquery'})\n", - " -- reset_index(drop=True)\n", - " -- explode('subquery_list')\n", - " -- ['subquery_list'] = ...\n", - " -- sem_map('Break down {query} into 2-3 simpler, focused subqu...')\n", - " -- Source(bound=True)\n" - ] - } - ], - "source": [ - "print(\"\\n\" + \"=\" * 70)\n", - "print(\"Step 2: Explode subqueries into separate rows\")\n", - "print(\"=\" * 70)\n", - "\n", - "# Process subqueries: split, explode, and clean\n", - "query_df[\"subquery_list\"] = query_df[\"_subqueries\"].map(lambda x: x.strip().split(\"\\n\"))\n", - "exploded_df = query_df.explode(\"subquery_list\").reset_index(drop=True)\n", - "exploded_df = exploded_df.rename(columns={\"subquery_list\": \"subquery\"})\n", - "exploded_df[\"subquery\"] = exploded_df[\"subquery\"].map(lambda x: x.strip())\n", - "exploded_df = exploded_df[[\"query_id\", \"query\", \"subquery\"]].reset_index(drop=True)\n", - "\n", - "print(\"\\nExploded subqueries LazyFrame:\")\n", - "print(exploded_df.show())" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "7cd48f7d", - 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - "Step 3: Semantic join - find relevant docs for each subquery\n", - "======================================================================\n", - "\n", - "Retrieval LazyFrame:\n", - "sem_join(join_instruction='{subquery:left} can be answered using information from {content:right}', how='inner', suffix='_match', return_explanations=False, default=True, safe_mode=False)\n", - " -- current pipeline\n", - " -- reset_index(drop=True)\n", - " -- [['query_id', 'query', 'subquery']]\n", - " -- ['subquery'] = ...\n", - " -- rename(columns={'subquery_list': 'subquery'})\n", - " -- reset_index(drop=True)\n", - " -- explode('subquery_list')\n", - " -- ['subquery_list'] = ...\n", - " -- sem_map('Break down {query} into 2-3 simpler, focused subqu...')\n", - " -- Source(bound=True)\n", - " -- right pipeline\n", - " Source(bound=True)\n" - ] - } - ], - "source": [ - "print(\"\\n\" + \"=\" * 70)\n", - "print(\"Step 3: Semantic join - find relevant docs for each subquery\")\n", - "print(\"=\" * 70)\n", - "\n", - "# Build the join LazyFrame - using LazyFrame as right side\n", - "retrieval_df = exploded_df.sem_join(\n", - " corpus_lf, # Pass LazyFrame to sem_join\n", - " \"{subquery:left} can be answered using information from {content:right}\",\n", - " suffix=\"_match\",\n", - ")\n", - "\n", - "print(\"\\nRetrieval LazyFrame:\")\n", - "print(retrieval_df.show())" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "8b4864c5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - "Step 4: Aggregate docs per original query\n", - "======================================================================\n", - "\n", - "Aggregated LazyFrame:\n", - "reset_index()\n", - " 
-- apply(, include_groups=False)\n", - " -- groupby('query_id')\n", - " -- sem_join(join_instruction='{subquery:left} can be answered using information from {content:right}', how='inner', suffix='_match', return_explanations=False, default=True, safe_mode=False)\n", - " -- current pipeline\n", - " -- reset_index(drop=True)\n", - " -- [['query_id', 'query', 'subquery']]\n", - " -- ['subquery'] = ...\n", - " -- rename(columns={'subquery_list': 'subquery'})\n", - " -- reset_index(drop=True)\n", - " -- explode('subquery_list')\n", - " -- ['subquery_list'] = ...\n", - " -- sem_map('Break down {query} into 2-3 simpler, focused subqu...')\n", - " -- Source(bound=True)\n", - " -- right pipeline\n", - " Source(bound=True)\n" - ] - } - ], - "source": [ - "print(\"\\n\" + \"=\" * 70)\n", - "print(\"Step 4: Aggregate docs per original query\")\n", - "print(\"=\" * 70)\n", - "\n", - "\n", - "def aggregate_docs(group):\n", - " \"\"\"Aggregate retrieved documents per query.\"\"\"\n", - " unique_docs = group.drop_duplicates(subset=[\"doc_id\"])\n", - " docs_text = \"\\n\\n---\\n\\n\".join(unique_docs[\"content\"].tolist())\n", - " return pd.Series({\"query\": group[\"query\"].iloc[0], \"num_docs\": len(unique_docs), \"aggregated_docs\": docs_text})\n", - "\n", - "\n", - "aggregated_df = retrieval_df.groupby(\"query_id\").apply(aggregate_docs, include_groups=False).reset_index()\n", - "\n", - "print(\"\\nAggregated LazyFrame:\")\n", - "print(aggregated_df.show())" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "835706fb", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - "Step 5: Answer queries using retrieved docs\n", - "======================================================================\n", - "\n", - "Answer generation LazyFrame:\n", - "sem_map('Using the following documents:\\n\\n{aggregated_docs}\\n...')\n", - " -- 
reset_index()\n", - " -- apply(, include_groups=False)\n", - " -- groupby('query_id')\n", - " -- sem_join(join_instruction='{subquery:left} can be answered using information from {content:right}', how='inner', suffix='_match', return_explanations=False, default=True, safe_mode=False)\n", - " -- current pipeline\n", - " -- reset_index(drop=True)\n", - " -- [['query_id', 'query', 'subquery']]\n", - " -- ['subquery'] = ...\n", - " -- rename(columns={'subquery_list': 'subquery'})\n", - " -- reset_index(drop=True)\n", - " -- explode('subquery_list')\n", - " -- ['subquery_list'] = ...\n", - " -- sem_map('Break down {query} into 2-3 simpler, focused subqu...')\n", - " -- Source(bound=True)\n", - " -- right pipeline\n", - " Source(bound=True)\n" - ] - } - ], - "source": [ - "print(\"\\n\" + \"=\" * 70)\n", - "print(\"Step 5: Answer queries using retrieved docs\")\n", - "print(\"=\" * 70)\n", - "\n", - "answer_df = aggregated_df.sem_map(\n", - " \"Using the following documents:\\n\\n{aggregated_docs}\\n\\n\"\n", - " \"Please provide a comprehensive answer to this question: {query}\\n\\n\"\n", - " \"Synthesize information from all relevant documents to give a complete answer.\",\n", - " suffix=\"_answer\",\n", - ")\n", - "\n", - "print(\"\\nAnswer generation LazyFrame:\")\n", - "print(answer_df.show())" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "7147aed8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "======================================================================\n", - "Step 6: Execute the LazyFrame pipeline\n", - "======================================================================\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Mapping: 100%|██████████ 2/2 LM calls [00:01<00:00, 1.01it/s]\n", - "Join comparisons: 100%|██████████ 60/60 LM Calls [00:03<00:00, 19.95it/s]\n", - "Mapping: 100%|██████████ 2/2 LM calls [00:05<00:00, 2.70s/it]" - ] - }, - 
{ - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Execution complete! Results:\n", - " query_id query num_docs \\\n", - "0 1 What are the health benefits of exercise and w... 4 \n", - "1 2 How does climate change affect biodiversity an... 5 \n", - "\n", - " aggregated_docs \\\n", - "0 Regular exercise improves cardiovascular healt... \n", - "1 Climate change is causing rising temperatures ... \n", - "\n", - " _answer \n", - "0 Regular exercise offers significant health ben... \n", - "1 Climate change significantly affects biodivers... \n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "print(\"\\n\" + \"=\" * 70)\n", - "print(\"Step 6: Execute the LazyFrame pipeline\")\n", - "print(\"=\" * 70)\n", - "\n", - "# Execute the LazyFrame - this runs all the operations we've built\n", - "result_df = answer_df.execute({queries_lf: queries_df, corpus_lf: corpus_df})\n", - "\n", - "print(\"\\nExecution complete! Results:\")\n", - "print(result_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "47446a74", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'query_id': 1,\n", - " 'query': 'What are the health benefits of exercise and what types of exercise are best for beginners?',\n", - " 'num_docs': 4,\n", - " 'aggregated_docs': \"Regular exercise improves cardiovascular health by strengthening the heart muscle and improving blood circulation. It reduces the risk of heart disease and stroke.\\n\\n---\\n\\nWalking is an excellent low-impact exercise for beginners. It requires no special equipment and can be done anywhere. Start with 15-20 minutes daily.\\n\\n---\\n\\nStrength training helps build muscle mass and increases metabolism. Beginners should start with bodyweight exercises like squats and push-ups.\\n\\n---\\n\\nSwimming is a great full-body workout that's easy on the joints. 
It's ideal for beginners and those with mobility issues.\",\n", - " '_answer': 'Regular exercise offers significant health benefits, particularly for cardiovascular health. It strengthens the heart muscle and improves blood circulation, which helps reduce the risk of heart disease and stroke.\\n\\nFor beginners, several types of exercise are especially suitable:\\n\\n- **Walking**: This is an excellent low-impact exercise that requires no special equipment and can be done anywhere. Beginners are encouraged to start with 15-20 minutes daily, making it an accessible way to begin an exercise routine.\\n\\n- **Strength Training**: Building muscle mass through strength training also increases metabolism. Beginners should focus on bodyweight exercises such as squats and push-ups, which are effective and do not require equipment.\\n\\n- **Swimming**: Swimming provides a full-body workout and is gentle on the joints, making it ideal for beginners and individuals with mobility issues.\\n\\nIn summary, regular exercise improves heart health and reduces disease risk. Walking, bodyweight strength training, and swimming are all excellent choices for those new to exercise, offering a combination of accessibility, safety, and effectiveness.'},\n", - " {'query_id': 2,\n", - " 'query': 'How does climate change affect biodiversity and what can be done to protect endangered species?',\n", - " 'num_docs': 5,\n", - " 'aggregated_docs': 'Climate change is causing rising temperatures that disrupt ecosystems worldwide. Many species cannot adapt quickly enough to survive.\\n\\n---\\n\\nHabitat loss due to climate change is a major threat to biodiversity. Polar bears, coral reefs, and rainforest species are particularly vulnerable.\\n\\n---\\n\\nConservation efforts like protected areas and wildlife corridors help endangered species survive climate impacts.\\n\\n---\\n\\nReducing carbon emissions is crucial for protecting biodiversity. 
Transitioning to renewable energy can slow climate change effects.\\n\\n---\\n\\nCommunity-based conservation programs involve local people in protecting endangered species and their habitats.',\n", - " '_answer': 'Climate change significantly affects biodiversity by causing rising temperatures that disrupt ecosystems around the world. Many species are unable to adapt quickly enough to these rapid changes, leading to declines in populations and, in some cases, extinction. Habitat loss is a major threat resulting from climate change, as shifting temperatures and weather patterns make certain environments uninhabitable for the species that depend on them. Particularly vulnerable groups include polar bears, which rely on sea ice; coral reefs, which are sensitive to temperature changes and ocean acidification; and rainforest species, which are threatened by both warming and changing rainfall patterns.\\n\\nTo protect endangered species from the impacts of climate change, several strategies are essential:\\n\\n1. **Conservation Efforts:** Establishing protected areas and creating wildlife corridors can help endangered species survive by preserving critical habitats and allowing animals to move to more suitable environments as conditions change.\\n\\n2. **Reducing Carbon Emissions:** Addressing the root cause of climate change is crucial. Transitioning to renewable energy sources and reducing greenhouse gas emissions can slow the effects of climate change, giving species and ecosystems more time to adapt.\\n\\n3. **Community-Based Conservation:** Involving local communities in conservation programs helps protect endangered species and their habitats. These programs leverage local knowledge and foster stewardship, making conservation efforts more effective and sustainable.\\n\\nIn summary, climate change threatens biodiversity by altering habitats and outpacing the ability of many species to adapt. 
Protecting endangered species requires a combination of habitat conservation, emission reductions, and community involvement to mitigate these impacts and preserve the planet’s rich diversity of life.'}]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# View results as a list of dictionaries\n", - "result_df.to_dict(orient=\"records\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/web_search_examples/tavily_topk.py b/examples/web_search_examples/tavily_topk.py index 3a16744c..cc0cfeb7 100644 --- a/examples/web_search_examples/tavily_topk.py +++ b/examples/web_search_examples/tavily_topk.py @@ -6,8 +6,8 @@ lotus.settings.configure(lm=lm) -df = web_search(WebSearchCorpus.TAVILY, "AI ethics in 2025", 10)[["title", "summary"]] +df = web_search(WebSearchCorpus.TAVILY, "AI ethics in 2025", 10)[["title", "content"]] print(f"Results from Tavily:\n{df}\n") -top_tavily_articles = df.sem_topk("Which {summary} best explains ethical concerns in AI?", K=3) +top_tavily_articles = df.sem_topk("Which {content} best explains ethical concerns in AI?", K=3) print(f"Top 3 articles from Tavily on AI ethics:\n{top_tavily_articles}") diff --git a/examples/web_search_examples/you_topk.py b/examples/web_search_examples/you_topk.py index a53244e0..1258283c 100644 --- a/examples/web_search_examples/you_topk.py +++ b/examples/web_search_examples/you_topk.py @@ -6,8 +6,8 @@ lotus.settings.configure(lm=lm) -df = web_search(WebSearchCorpus.YOU, "latest AI breakthroughs", 10)[["title", "snippet"]] +df = web_search(WebSearchCorpus.YOU, 
"latest AI breakthroughs", 10)[["title", "snippets"]] print(f"Results from You.com:\n{df}\n") -top_you_articles = df.sem_topk("Which {snippet} is the most groundbreaking?", K=3) +top_you_articles = df.sem_topk("Which {snippets} is the most groundbreaking?", K=3) print(f"Top 3 most interesting articles from You.com:\n{top_you_articles}") From fd6b8cfeca2a8d3a0f74af92019c0dad329c7a7d Mon Sep 17 00:00:00 2001 From: Harshit Gupta Date: Thu, 23 Apr 2026 23:31:53 -0700 Subject: [PATCH 3/3] fix tests --- .github/tests/multimodality_tests.py | 14 +++++++------- lotus/ast/lazyframe.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/tests/multimodality_tests.py b/.github/tests/multimodality_tests.py index db566a38..84833443 100644 --- a/.github/tests/multimodality_tests.py +++ b/.github/tests/multimodality_tests.py @@ -70,7 +70,7 @@ def test_filter_operation(setup_models, model): "https://thumbs.dreamstime.com/b/comida-r%C3%A1pida-nachos-con-el-sause-del-tomate-ejemplo-exhausto-de-la-acuarela-mano-aislado-en-blanco-150936354.jpg", "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&ssl=1", "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg", - "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg", + "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600", ] df = pd.DataFrame({"image": ImageArray(image_url)}) user_instruction = "{image} represents food" @@ -96,7 +96,7 @@ def test_join_operation(setup_models, model): "https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0", "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&ssl=1", "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg", - "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg", + 
"https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600", ] elements = ["doll", "bird"] image_df = pd.DataFrame({"image": ImageArray(image_url)}) @@ -123,14 +123,14 @@ def test_topk_operation(setup_models, model): "https://thumbs.dreamstime.com/b/comida-r%C3%A1pida-nachos-con-el-sause-del-tomate-ejemplo-exhausto-de-la-acuarela-mano-aislado-en-blanco-150936354.jpg", "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&ssl=1", "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg", - "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg", + "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600", ] df = pd.DataFrame({"image": ImageArray(image_url)}) user_instruction = "{image} represents living beings" top_2_expected = set( [ "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg", - "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg", + "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600", ] ) @@ -148,7 +148,7 @@ def test_topk_with_groupby_operation(setup_models, model): "https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0", "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&ssl=1", "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg", - "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg", + "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600", ] elements = ["doll", "bird"] image_df = pd.DataFrame({"image": ImageArray(image_url)}) @@ -169,7 +169,7 @@ def test_search_operation(setup_models, model): "https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0", 
"https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&ssl=1", "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg", - "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg", + "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600", ] expected_result = set(["https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg"]) @@ -190,7 +190,7 @@ def test_sim_join_operation_image_index(setup_models, model): "https://img.etsystatic.com/il/4bee20/1469037676/il_340x270.1469037676_iiti.jpg?version=0", "https://i1.wp.com/www.alloverthemap.net/wp-content/uploads/2014/02/2012-09-25-12.46.15.jpg?resize=400%2C284&ssl=1", "https://i.pinimg.com/236x/a4/3a/65/a43a65683a0314f29b66402cebdcf46d.jpg", - "https://pravme.ru/wp-content/uploads/2018/01/sobor-Bogord-1.jpg", + "https://images.pexels.com/photos/45201/kitty-cat-kitten-pet-45201.jpeg?auto=compress&cs=tinysrgb&w=600", ] elements = ["doll", "bird"] diff --git a/lotus/ast/lazyframe.py b/lotus/ast/lazyframe.py index 146bbc08..7891ea24 100644 --- a/lotus/ast/lazyframe.py +++ b/lotus/ast/lazyframe.py @@ -862,7 +862,7 @@ def optimize( if not all_optimizers: lotus.logger.warning("LazyFrame.optimize: no optimizers provided, returning original LazyFrame") - return + return self if inplace else self.copy() lotus.logger.debug( f"LazyFrame.optimize: {len(self._nodes)} nodes, " f"{len(all_optimizers)} optimizer(s), inplace={inplace}"