Skip to content

Commit 7480c9c

Browse files
authored
perf: fall back to brute-force FTS if filters match only a few rows (lance-format#4551)
If the filters match only a few rows, the WAND algorithm may fail to filter out docs. In that case we can evaluate only the matched rows, which is much faster than running WAND first: it decompresses at most `num_rows_matched * num_tokens` blocks. --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
1 parent 746d2dd commit 7480c9c

3 files changed

Lines changed: 180 additions & 21 deletions

File tree

python/python/tests/test_scalar_index.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,38 @@ def test_fts_score(tmp_path):
721721
assert results["id"].to_pylist() == [3, 2, 1]
722722

723723

724+
def test_fts_with_filter(tmp_path):
725+
data = pa.table(
726+
{
727+
"id": [1, 2, 3],
728+
"text": ["lance database test", "full text search", "lance search text"],
729+
}
730+
)
731+
ds = lance.write_dataset(data, tmp_path)
732+
ds.create_scalar_index("id", "BTREE")
733+
ds.create_scalar_index("text", "INVERTED")
734+
735+
results = ds.to_table(full_text_query="lance search text")
736+
assert results.num_rows == 3
737+
assert results["id"].to_pylist() == [3, 2, 1]
738+
739+
score_id1 = results.column("_score")[2].as_py()
740+
741+
results = ds.to_table(
742+
full_text_query="lance search text",
743+
filter="id <= 1",
744+
prefilter=True,
745+
)
746+
assert results.num_rows == 1
747+
assert results["id"].to_pylist() == [1]
748+
assert results.column("_score")[0].as_py() == score_id1
749+
750+
plan = ds.scanner(
751+
full_text_query="lance search text", filter="id <= 1", prefilter=True
752+
).analyze_plan()
753+
assert "index_comparisons=1" in plan
754+
755+
724756
def test_fts_on_list(tmp_path):
725757
data = pa.table(
726758
{

rust/lance-index/src/scalar/inverted/index.rs

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ use std::sync::Arc;
66
use std::{
77
cmp::{min, Reverse},
88
collections::BinaryHeap,
9-
ops::RangeInclusive,
109
};
1110
use std::{
1211
collections::{HashMap, HashSet},
@@ -159,6 +158,7 @@ impl InvertedIndex {
159158
return Ok((Vec::new(), Vec::new()));
160159
}
161160
let mask = prefilter.mask();
161+
162162
let mut candidates = BinaryHeap::new();
163163
let parts = self
164164
.partitions
@@ -390,6 +390,7 @@ impl ScalarIndex for InvertedIndex {
390390
.buffer_unordered(store.io_parallelism())
391391
.try_collect::<Vec<_>>()
392392
.await?;
393+
393394
let tokenizer = params.build()?;
394395
Ok(Arc::new(Self {
395396
params,
@@ -1738,6 +1739,9 @@ impl Ord for RawDocInfo {
17381739
pub struct DocSet {
17391740
row_ids: Vec<u64>,
17401741
num_tokens: Vec<u32>,
1742+
// (row_id, doc_id) pairs sorted by row_id
1743+
inv: Vec<(u64, u32)>,
1744+
17411745
total_tokens: u64,
17421746
}
17431747

@@ -1759,8 +1763,19 @@ impl DocSet {
17591763
self.row_ids[doc_id as usize]
17601764
}
17611765

1762-
pub fn row_range(&self) -> RangeInclusive<u64> {
1763-
self.row_ids[0]..=self.row_ids[self.len() - 1]
1766+
pub fn doc_id(&self, row_id: u64) -> Option<u64> {
1767+
if self.inv.is_empty() {
1768+
// in legacy format, the row id is doc id
1769+
match self.row_ids.binary_search(&row_id) {
1770+
Ok(_) => Some(row_id),
1771+
Err(_) => None,
1772+
}
1773+
} else {
1774+
match self.inv.binary_search_by_key(&row_id, |x| x.0) {
1775+
Ok(idx) => Some(self.inv[idx].1 as u64),
1776+
Err(_) => None,
1777+
}
1778+
}
17641779
}
17651780

17661781
pub fn total_tokens_num(&self) -> u64 {
@@ -1829,25 +1844,30 @@ impl DocSet {
18291844
let row_id_col = batch[ROW_ID].as_primitive::<datatypes::UInt64Type>();
18301845
let num_tokens_col = batch[NUM_TOKEN_COL].as_primitive::<datatypes::UInt32Type>();
18311846

1832-
let (row_ids, num_tokens) = match is_legacy {
1847+
let (row_ids, num_tokens, inv) = match is_legacy {
18331848
// for legacy format, the row id is doc id,
18341849
// in order to support efficient search, we need to sort the row ids,
18351850
// so that we can use binary search to get num_tokens
1836-
true => row_id_col
1837-
.values()
1838-
.iter()
1839-
.filter_map(|id| {
1840-
if let Some(frag_reuse_index_ref) = frag_reuse_index.as_ref() {
1841-
frag_reuse_index_ref.remap_row_id(*id)
1842-
} else {
1843-
Some(*id)
1844-
}
1845-
})
1846-
.zip(num_tokens_col.values().iter())
1847-
.sorted_unstable_by_key(|x| x.0)
1848-
.unzip(),
1851+
true => {
1852+
let (row_ids, num_tokens) = row_id_col
1853+
.values()
1854+
.iter()
1855+
.filter_map(|id| {
1856+
if let Some(frag_reuse_index_ref) = frag_reuse_index.as_ref() {
1857+
frag_reuse_index_ref.remap_row_id(*id)
1858+
} else {
1859+
Some(*id)
1860+
}
1861+
})
1862+
.zip(num_tokens_col.values().iter())
1863+
.sorted_unstable_by_key(|x| x.0)
1864+
.unzip();
1865+
1866+
// the legacy format doesn't need to store the inv
1867+
(row_ids, num_tokens, Vec::new())
1868+
}
18491869
false => {
1850-
let row_ids = row_id_col
1870+
let row_ids: Vec<u64> = row_id_col
18511871
.values()
18521872
.iter()
18531873
.filter_map(|id| {
@@ -1859,14 +1879,24 @@ impl DocSet {
18591879
})
18601880
.collect();
18611881
let num_tokens = num_tokens_col.values().to_vec();
1862-
(row_ids, num_tokens)
1882+
1883+
// build the inv
1884+
let inv = row_ids
1885+
.iter()
1886+
.copied()
1887+
.enumerate()
1888+
.sorted_unstable()
1889+
.map(|(i, row_id)| (row_id, i as u32))
1890+
.collect();
1891+
(row_ids, num_tokens, inv)
18631892
}
18641893
};
18651894

18661895
let total_tokens = num_tokens.iter().map(|&x| x as u64).sum();
18671896
Ok(Self {
18681897
row_ids,
18691898
num_tokens,
1899+
inv,
18701900
total_tokens,
18711901
})
18721902
}
@@ -1901,6 +1931,8 @@ impl DocSet {
19011931
self.num_tokens[doc_id as usize]
19021932
}
19031933

1934+
// this can be used only if it's a legacy format,
1935+
// which store the sorted row ids so that we can use binary search
19041936
#[inline]
19051937
pub fn num_tokens_by_row_id(&self, row_id: u64) -> u32 {
19061938
self.row_ids

rust/lance-index/src/scalar/inverted/wand.rs

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ use arrow::array::AsArray;
99
use arrow::datatypes::{Int32Type, UInt32Type};
1010
use arrow_array::{Array, UInt32Array};
1111
use arrow_schema::DataType;
12+
use itertools::Itertools;
13+
use lance_core::utils::address::RowAddress;
1214
use lance_core::utils::mask::RowIdMask;
1315
use lance_core::Result;
1416

@@ -321,17 +323,26 @@ impl<'a, S: Scorer> Wand<'a, S> {
321323
return Ok(vec![]);
322324
}
323325

326+
let avg_posting_length =
327+
self.postings.iter().map(|p| p.list.len()).sum::<usize>() / self.postings.len();
328+
match (mask.max_len(), mask.iter_ids()) {
329+
(Some(num_rows_matched), Some(row_ids))
330+
if num_rows_matched <= avg_posting_length as u64 =>
331+
{
332+
return self.flat_search(params, row_ids, metrics);
333+
}
334+
_ => {}
335+
}
336+
324337
let mut candidates = BinaryHeap::new();
325338
let mut num_comparisons = 0;
326339
while let Some((pivot, doc)) = self.next()? {
327340
self.cur_doc = Some(doc);
328341
num_comparisons += 1;
329342

330-
// if the doc is not located, we need to find the row id
331343
let row_id = match &doc {
332344
DocInfo::Raw(doc) => {
333345
// if the doc is not located, we need to find the row id
334-
// in the doc set. This is a bit slow, but it should be rare.
335346
self.docs.row_id(doc.doc_id)
336347
}
337348
DocInfo::Located(doc) => doc.row_id,
@@ -379,6 +390,90 @@ impl<'a, S: Scorer> Wand<'a, S> {
379390
.collect())
380391
}
381392

393+
fn flat_search(
394+
&mut self,
395+
params: &FtsSearchParams,
396+
row_ids: Box<dyn Iterator<Item = RowAddress> + '_>,
397+
metrics: &dyn MetricsCollector,
398+
) -> Result<Vec<DocCandidate>> {
399+
let limit = params.limit.unwrap_or(usize::MAX);
400+
if limit == 0 {
401+
return Ok(vec![]);
402+
}
403+
404+
// we need to map the row ids to doc ids, and sort them,
405+
// because WAND PostingIterator can't go back to the previous doc id
406+
let doc_ids = row_ids
407+
.filter_map(|row_addr| {
408+
let row_id: u64 = row_addr.into();
409+
self.docs.doc_id(row_id).map(|doc_id| (doc_id, row_id))
410+
})
411+
.sorted_unstable()
412+
.collect::<Vec<_>>();
413+
let is_compressed = matches!(self.postings[0].list, PostingList::Compressed(_));
414+
415+
let mut num_comparisons = 0;
416+
let mut candidates = BinaryHeap::new();
417+
for (doc_id, row_id) in doc_ids {
418+
num_comparisons += 1;
419+
420+
// move all postings to this doc id
421+
self.move_preceding(self.postings.len() - 1, doc_id);
422+
if self.postings.is_empty() {
423+
// no more postings, so we can stop
424+
break;
425+
} else if self.postings[0].doc().map(|d| d.doc_id()) != Some(doc_id) {
426+
// this doc is not in the postings, so we can skip it
427+
continue;
428+
}
429+
430+
let mut pivot = 0;
431+
while pivot + 1 < self.postings.len()
432+
&& self.postings[pivot + 1].doc().map(|d| d.doc_id()) == Some(doc_id)
433+
{
434+
pivot += 1;
435+
}
436+
437+
// check positions
438+
if params.phrase_slop.is_some()
439+
&& !self.check_positions(params.phrase_slop.unwrap() as i32)
440+
{
441+
continue;
442+
}
443+
444+
// score the doc
445+
let doc_length = match is_compressed {
446+
true => self.docs.num_tokens(doc_id as u32),
447+
false => self.docs.num_tokens_by_row_id(row_id),
448+
};
449+
450+
let score = self.score(pivot, doc_length);
451+
let freqs = self
452+
.iter_token_freqs(pivot)
453+
.map(|(token, freq)| (token.to_owned(), freq))
454+
.collect();
455+
456+
if candidates.len() < limit {
457+
candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length)));
458+
} else if score > candidates.peek().unwrap().0 .0.score.0 {
459+
candidates.pop();
460+
candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length)));
461+
self.threshold = candidates.peek().unwrap().0 .0.score.0 * params.wand_factor;
462+
}
463+
}
464+
metrics.record_comparisons(num_comparisons);
465+
466+
Ok(candidates
467+
.into_sorted_vec()
468+
.into_iter()
469+
.map(|Reverse((doc, freqs, doc_length))| DocCandidate {
470+
row_id: doc.row_id,
471+
freqs,
472+
doc_length,
473+
})
474+
.collect())
475+
}
476+
382477
// calculate the score of the current document
383478
fn score(&self, pivot: usize, doc_length: u32) -> f32 {
384479
let mut score = 0.0;

0 commit comments

Comments
 (0)