From 1b10f5e90e54b90e6ce6aa8c387b481d014f2894 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 13 Jan 2025 11:14:46 -0500 Subject: [PATCH 1/7] Add github action to codespell main on push and PRs --- .github/workflows/codespell.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/codespell.yml diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 00000000..b2316674 --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,25 @@ +# Codespell configuration is within pyproject.toml +--- +name: Codespell + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + codespell: + name: Check for spelling errors + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Annotate locations with typos + uses: codespell-project/codespell-problem-matcher@v1 + - name: Codespell + uses: codespell-project/actions-codespell@v2 From 674b3dc6ed44190d3ef0ee6915630fc18dc970f1 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 13 Jan 2025 11:14:46 -0500 Subject: [PATCH 2/7] Add rudimentary codespell config --- pyproject.toml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e0e09be0..821b21a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,4 +74,10 @@ line-ending = "auto" [tool.mypy] python_version = "3.10" strict = true -ignore_missing_imports = true \ No newline at end of file +ignore_missing_imports = true +[tool.codespell] +# Ref: https://github.com/codespell-project/codespell#using-a-config-file +skip = '.git*' +check-hidden = true +# ignore-regex = '' +# ignore-words-list = '' From c3e30b118c367bc744c55ec2da7b65222c428959 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 13 Jan 2025 11:14:46 -0500 Subject: [PATCH 3/7] Add pre-commit definition for codespell --- .pre-commit-config.yaml | 8 
++++++++ 1 file changed, 8 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0964e87f..ec83271e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,3 +19,11 @@ repos: - sentence-transformers>=3.0.1 - tiktoken>=0.7.0 - tqdm>=4.66.4 + + - repo: https://github.com/codespell-project/codespell + # Configuration for codespell is in pyproject.toml + rev: v2.3.0 + hooks: + - id: codespell + additional_dependencies: + - tomli From 56d5c634871b6c30096eecc884f6c4a0a3e5af73 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 13 Jan 2025 11:16:19 -0500 Subject: [PATCH 4/7] Adjust codespell config for some names etc --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 821b21a1..5e33f39b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,5 +79,5 @@ ignore_missing_imports = true # Ref: https://github.com/codespell-project/codespell#using-a-config-file skip = '.git*' check-hidden = true -# ignore-regex = '' -# ignore-words-list = '' +ignore-regex = '\bParth\b' +ignore-words-list = 'ans' From 7241c297639ad7f9e2b695d4b4357486ef9bd69c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 13 Jan 2025 11:17:00 -0500 Subject: [PATCH 5/7] [DATALAD RUNCMD] Do interactive fixing of some ambiguous typos === Do not change lines below === { "chain": [], "cmd": "codespell -w -i 3 -C 2 ./lotus/utils.py", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "." 
} ^^^ Do not change lines above ^^^ --- lotus/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lotus/utils.py b/lotus/utils.py index b3e68ac9..6f38ff6f 100644 --- a/lotus/utils.py +++ b/lotus/utils.py @@ -56,7 +56,7 @@ def ret( rm.load_index(col_index_dir) assert rm.index_dir == col_index_dir - ids = df.index.tolist() # assumes df index hasn't been resest and corresponds to faiss index ids + ids = df.index.tolist() # assumes df index hasn't been reset and corresponds to faiss index ids vec_set = rm.get_vectors_from_index(col_index_dir, ids) d = vec_set.shape[1] kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose) From a4d79a0682ced379c253e191d7ad8165811fe839 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 13 Jan 2025 11:17:05 -0500 Subject: [PATCH 6/7] [DATALAD RUNCMD] run codespell throughout fixing few left typos automagically === Do not change lines below === { "chain": [], "cmd": "codespell -w", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "." } ^^^ Do not change lines above ^^^ --- README.md | 4 ++-- docs/approximation_cascades.rst | 4 ++-- docs/configurations.rst | 2 +- docs/core_concepts.rst | 6 +++--- docs/multimodal_models.rst | 2 +- docs/sem_index.rst | 2 +- docs/sem_map.rst | 2 +- docs/sem_sim_join.rst | 2 +- docs/sem_topk.rst | 2 +- examples/model_examples/cache.py | 2 +- lotus/models/reranker.py | 2 +- lotus/sem_ops/postprocessors.py | 2 +- lotus/sem_ops/sem_join.py | 2 +- 13 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index f01ae332..6a12bdc6 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ LOTUS makes LLM-powered data processing fast and easy. -LOTUS (**L**LMs **O**ver **T**ables of **U**nstructured and **S**tructured Data) provides a declarative programming model and an optimized query engine for serving powerful reasoning-based query pipelines over structured and unstructured data! 
We provide a simple and intuitive Pandas-like API, that implements **semantic operators**. +LOTUS (**L**LMs **O**ver **T**ables of **U**nstructured and **S**tructured Data) provides a declarative programming model and an optimized query engine for serving powerful reasoning-based query pipelines over structured and unstructured data! We provide a simple and intuitive Pandas-like API, that implements **semantic operators**. For trouble-shooting or feature requests, please raise an issue and we'll get to it promptly. To share feedback and applications you're working on, you can send us a message on our [community slack](https://join.slack.com/t/lotus-fnm8919/shared_invite/zt-2tnq6948j-juGuSIR0__fsh~kUmZ6TJw), or send an email (lianapat@stanford.edu). @@ -88,7 +88,7 @@ LOTUS offers a number of semantic operators in a Pandas-like API, some of which | sem_filter | Keep records that match the natural language predicate | | sem_extract | Extract one or more attributes from each row | | sem_agg | Aggregate across all records (e.g. for summarization) | -| sem_topk | Order the records by some natural langauge sorting criteria | +| sem_topk | Order the records by some natural language sorting criteria | | sem_join | Join two datasets based on a natural language predicate | | sem_sim_join | Join two DataFrames based on semantic similarity | | sem_search | Perform semantic search the over a text column | diff --git a/docs/approximation_cascades.rst b/docs/approximation_cascades.rst index 9c840b83..43545c35 100644 --- a/docs/approximation_cascades.rst +++ b/docs/approximation_cascades.rst @@ -8,11 +8,11 @@ LOTUS serves approximations for semantic operators to let you balance speed and You can set accurayc targets according to the requirements of your application, and LOTUS will use approximations to optimize the implementation for lower computaitonal overhead, while providing probabilistic accuracy guarantees. 
One core technique for providing these approximations is the use of cascades. -Cascades provide a way to optimize certian semantic operators (Join Cascade and Filter Cascade) by blending +Cascades provide a way to optimize certain semantic operators (Join Cascade and Filter Cascade) by blending a less costly but potentially inaccurate proxy model with a high-quality oracle model. The method seeks to achieve preset precision and recall targets with a given probability while controlling computational overhead. -Cascades work by intially using a cheap approximation to score and filters/joins tuples. Using statistically +Cascades work by initially using a cheap approximation to score and filters/joins tuples. Using statistically supported thresholds found from sampling prior, it then assigns each tuple to one of three actions based on the proxy's score: accept, reject, or seek clarification from the oracle model. diff --git a/docs/configurations.rst b/docs/configurations.rst index 16cec150..8ac5c728 100644 --- a/docs/configurations.rst +++ b/docs/configurations.rst @@ -22,7 +22,7 @@ Configurable Parameters -------------------------- 1. enable_message_cache: - * Description: Enables or Disables cahcing mechanisms + * Description: Enables or Disables caching mechanisms * Default: False * Parameters: - cache_type: Type of caching (SQLITE or In_MEMORY) diff --git a/docs/core_concepts.rst b/docs/core_concepts.rst index 01a589b4..334a9935 100644 --- a/docs/core_concepts.rst +++ b/docs/core_concepts.rst @@ -2,10 +2,10 @@ Core Concepts ================== LOTUS' implements the semantic operator programming model. Semantic operators are declarative transformations over one or more -datasets, parameterized by a natural langauge expression (*langex*) that can be implemnted by a variety of AI-based algorithms. +datasets, parameterized by a natural language expression (*langex*) that can be implemented by a variety of AI-based algorithms. 
Semantic operators seamlessly extend the relational model, operating over datasets that may contain traditional structured data as well as unstructured fields, such as free-form text or images. Because semantic operators are composable, modular and declarative, they allow you to write -AI-based piplines with intuitive, high-level logic, leaving the rest of the work to the query engine! Each operator can be implmented and +AI-based pipelines with intuitive, high-level logic, leaving the rest of the work to the query engine! Each operator can be implemented and optimized in multiple ways, opening a rich space for execution plans, similar to relational operators. Here is a quick example of semantic operators in action: .. code-block:: python @@ -28,7 +28,7 @@ Here are some key semantic operators: +--------------+----------------------------------------------------------+ | sem_agg | Aggregate across all records (e.g. for summarization) | +--------------+----------------------------------------------------------+ -| sem_topk | Order records by the natural langauge ranking criteria | +| sem_topk | Order records by the natural language ranking criteria | +--------------+----------------------------------------------------------+ | sem_join | Join two datasets based on a natural language predicate | +--------------+----------------------------------------------------------+ diff --git a/docs/multimodal_models.rst b/docs/multimodal_models.rst index 16ef9c6f..977c5134 100644 --- a/docs/multimodal_models.rst +++ b/docs/multimodal_models.rst @@ -11,7 +11,7 @@ PIL images, numpy arrays, base64 strings, and image URLs Initializing ImageArray ----------------------- The ImageArray class is an extension array designed to handle images as data types in pandas. -You can initilize an ImageArray with a list of supported image formats +You can initialize an ImageArray with a list of supported image formats .. 
code-block:: python diff --git a/docs/sem_index.rst b/docs/sem_index.rst index 3e7bf744..c70ac628 100644 --- a/docs/sem_index.rst +++ b/docs/sem_index.rst @@ -5,7 +5,7 @@ Overview --------- The sem_index operator in LOTUS creates a semantic index over the specified column in the dataset. This index enables efficient retrieval and ranking of records based on semantic similarity. -The index will be generated with the configured retreival model stored locally in the specified directory. +The index will be generated with the configured retrieval model stored locally in the specified directory. Example diff --git a/docs/sem_map.rst b/docs/sem_map.rst index 589b91b6..b871cf65 100644 --- a/docs/sem_map.rst +++ b/docs/sem_map.rst @@ -3,7 +3,7 @@ sem_map Overview ---------- -This operato performs a semantic projection over an input column. The langex parameter specifies this projection in natural language. +This operator performs a semantic projection over an input column. The langex parameter specifies this projection in natural language. Motivation ----------- diff --git a/docs/sem_sim_join.rst b/docs/sem_sim_join.rst index 79145a15..e6089e4b 100644 --- a/docs/sem_sim_join.rst +++ b/docs/sem_sim_join.rst @@ -3,7 +3,7 @@ sem_sim_join Overview --------- -The similairty join matches tuples from the right and left table according to their semantic similarity, rather than an arbitrary +The similarity join matches tuples from the right and left table according to their semantic similarity, rather than an arbitrary natural-language predicate. Akin to an equi-join in standard relational algebra, the semantic similarity join is a specialized semantic join, can be heavily optimized using the semantic index. diff --git a/docs/sem_topk.rst b/docs/sem_topk.rst index d57f0931..f38f27e5 100644 --- a/docs/sem_topk.rst +++ b/docs/sem_topk.rst @@ -59,7 +59,7 @@ Required Parameters - **user_instruction** : The user instruction for sorting. - **K**: The number of rows to return. 
-Optional Paramaters +Optional Parameters --------------------- - **method** : The method to use for sorting. Options are "quick", "heap", "naive", "quick-sem". - **group_by** : The columns to group by before sorting. Each group will be sorted separately. diff --git a/examples/model_examples/cache.py b/examples/model_examples/cache.py index 95bc2823..cc3ab95c 100644 --- a/examples/model_examples/cache.py +++ b/examples/model_examples/cache.py @@ -23,7 +23,7 @@ df = pd.DataFrame(data) user_instruction = "{Course Name} requires a lot of math" df = df.sem_filter(user_instruction) -print("====== intial run ======") +print("====== initial run ======") print(df) # run a second time diff --git a/lotus/models/reranker.py b/lotus/models/reranker.py index a7fd5996..c6ffda96 100644 --- a/lotus/models/reranker.py +++ b/lotus/models/reranker.py @@ -19,6 +19,6 @@ def __call__(self, query: str, docs: list[str], K: int) -> RerankerOutput: K (int): The number of documents to keep after reranking. Returns: - RerankerOutput: The indicies of the reranked documents. + RerankerOutput: The indices of the reranked documents. """ pass diff --git a/lotus/sem_ops/postprocessors.py b/lotus/sem_ops/postprocessors.py index d531099c..06d875a2 100644 --- a/lotus/sem_ops/postprocessors.py +++ b/lotus/sem_ops/postprocessors.py @@ -61,7 +61,7 @@ def extract_postprocess(llm_answers: list[str]) -> SemanticExtractPostprocessOut Postprocess the output of the extract operator to extract the schema. Args: - llm_answers (list[str]): The list of llm answers containging the extract. + llm_answers (list[str]): The list of llm answers containing the extract. Returns: SemanticExtractPostprocessOutput diff --git a/lotus/sem_ops/sem_join.py b/lotus/sem_ops/sem_join.py index 0050f499..04a2a5d1 100644 --- a/lotus/sem_ops/sem_join.py +++ b/lotus/sem_ops/sem_join.py @@ -384,7 +384,7 @@ def join_optimizer( int: The number of LM calls from optimizing join plan. 
""" - # Helper is currently default to similiarity join + # Helper is currently default to similarity join if lotus.settings.helper_lm is not None: lotus.logger.debug("Helper model is not supported yet. Default to similarity join.") From a1cacdfca49e2ade05106d3e0fe79658b375538b Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 13 Jan 2025 11:18:15 -0500 Subject: [PATCH 7/7] Adjust underlining in rst to reflect typo fix --- docs/sem_topk.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sem_topk.rst b/docs/sem_topk.rst index f38f27e5..21a06bba 100644 --- a/docs/sem_topk.rst +++ b/docs/sem_topk.rst @@ -60,8 +60,8 @@ Required Parameters - **K**: The number of rows to return. Optional Parameters ---------------------- +-------------------- - **method** : The method to use for sorting. Options are "quick", "heap", "naive", "quick-sem". - **group_by** : The columns to group by before sorting. Each group will be sorted separately. - **cascade_threshold**: The confidence threshold for cascading to a larger model. -- **return_stats** : Whether to return stats. \ No newline at end of file +- **return_stats** : Whether to return stats.