From a3f61d90145fc6710debd2edfee9f3de53fcda05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Gilbert?= Date: Tue, 20 May 2025 17:28:39 +0200 Subject: [PATCH] Search: allow typos when searching by string For experimentation I set to to require exact substring match when the searched string has length <= 4, and allow edit distance <= 2 otherwise. These 2 numbers could be turned into user-settable options. IDK if these are good defaults. Also not sure if constant ints is the right way to work. eg with longer input strings we may want to allow more typos so maybe `2` should be `min 2 (2% of input string length)` or some such thing. Probably needs user feedback to progress. (also no idea how performant this is) --- clib/cString.ml | 49 +++++++++++++++++++ clib/cString.mli | 5 ++ .../20660-search-fuzzy-Added.rst | 5 ++ .../proof-engine/vernacular-commands.rst | 8 +++ test-suite/output/Search.out | 13 +++-- test-suite/output/Search.v | 8 +++ vernac/search.ml | 14 +++++- 7 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 doc/changelog/08-vernac-commands-and-options/20660-search-fuzzy-Added.rst diff --git a/clib/cString.ml b/clib/cString.ml index aae672192d09..db52345ef642 100644 --- a/clib/cString.ml +++ b/clib/cString.ml @@ -143,6 +143,8 @@ let ordinal n = in string_of_int n ^ s +(* this and edit_distance copied from ocaml stdlib *) + let uchar_array_of_utf_8_string s = let slen = length s in (* is an upper bound on Uchar.t count *) let uchars = Array.make slen Uchar.max in @@ -209,6 +211,53 @@ let edit_distance ?(limit = Stdlib.Int.max_int) s0 s1 = let d = loop row_minus2 row_minus1 row 1 len0 limit s0 s1 in if d > limit then limit else d +(* modified from edit_distance to match a substring *) +let edit_distance_substring ?(limit = Stdlib.Int.max_int) ~pattern:s0 s1 = + let[@inline] minimum a b c = Stdlib.Int.min a (Stdlib.Int.min b c) in + (* XXX we should be able to cache s0 + instead recomputing every time we do the check *) + let s0, len0 = uchar_array_of_utf_8_string s0 in + let s1, len1 = uchar_array_of_utf_8_string s1 in + if len0 - len1 >= limit then limit else + let rec loop row_minus2 row_minus1 row i len0 limit s0 s1 = + if i > len0 then Array.fold_left Stdlib.Int.min Stdlib.Int.max_int row_minus1 else + let len1 = Array.length row - 1 in + let row_min = ref Stdlib.Int.max_int in + row.(0) <- i; + (* stdlib has some limit-based shortcuts here but naively + copying them doesn't work for substring matching *) + for j = 1 to len1 do + let cost = if Uchar.equal s0.(i-1) s1.(j-1) then 0 else 1 in + let min = minimum + (row_minus1.(j-1) + cost) (* substitute *) + (row_minus1.(j) + 1) (* delete *) + (row.(j-1) + 1) (* insert *) + in + let min = + if (i > 1 && j > 1 && + Uchar.equal s0.(i-1) s1.(j-2) && + Uchar.equal s0.(i-2) s1.(j-1)) + then Stdlib.Int.min min (row_minus2.(j-2) + cost) (* transpose *) + else min + in + row.(j) <- min; + row_min := Stdlib.Int.min !row_min min; + done; + if !row_min >= limit then (* can no longer decrease *) limit else + loop row_minus1 row row_minus2 (i + 1) len0 limit s0 s1 + in + let ignore = + (* Value used to make the values around the diagonal stripe ignored + by the min computations when we have a limit. *) + (* XXX why is max_int + 1 not a problem? *) + limit + 1 + in + let row_minus2 = Array.make (len1 + 1) ignore in + (* for substring matching row_minus1 starts with constant 0 instead of stdlib (fun x -> x) *) + let row_minus1 = Array.make (len1 + 1) 0 in + let row = Array.make (len1 + 1) ignore in + loop row_minus2 row_minus1 row 1 len0 limit s0 s1 + (* string parsing *) module Self = diff --git a/clib/cString.mli b/clib/cString.mli index 23f06a72a904..60b7d7ddbbcd 100644 --- a/clib/cString.mli +++ b/clib/cString.mli @@ -83,6 +83,11 @@ val edit_distance : ?limit:int -> string -> string -> int copied from ocaml 5.4 *) +val edit_distance_substring : ?limit:int -> pattern:string -> string -> int +(** [edit_distance_substring ?limit ~pattern s1] + is the minimum [edit_distance ?limit pattern s2] + where [s2] is a substring of [s1]. *) + (** {6 Generic operations} **) module Set : CSet.ExtS with type elt = t diff --git a/doc/changelog/08-vernac-commands-and-options/20660-search-fuzzy-Added.rst b/doc/changelog/08-vernac-commands-and-options/20660-search-fuzzy-Added.rst new file mode 100644 index 000000000000..71d628555663 --- /dev/null +++ b/doc/changelog/08-vernac-commands-and-options/20660-search-fuzzy-Added.rst @@ -0,0 +1,5 @@ +- **Added:** + approximate matching in :cmd:`Search`, controlled by new option :opt:`Fuzzy Search`. + Currently only when searching by name (:n:`Search @string`) + (`#20660 `_, + by Gaƫtan Gilbert). diff --git a/doc/sphinx/proof-engine/vernacular-commands.rst b/doc/sphinx/proof-engine/vernacular-commands.rst index c3b86e977eab..d3c50b9a61a9 100644 --- a/doc/sphinx/proof-engine/vernacular-commands.rst +++ b/doc/sphinx/proof-engine/vernacular-commands.rst @@ -162,6 +162,14 @@ described elsewhere put it between single quotes or explicitly provide a scope. See :ref:`this example `. + .. opt:: Fuzzy Search + + Objects whose name contains a string differing by at most the + value of this option (default 2) are also included in the + search output. Set to 0 to search for only exact matches. To + avoid overly verbose results exact matching is always used + when the searched string has length 4 or less. + :n:`hyp:` The provided pattern or reference is matched against any subterm of an hypothesis of the type of the objects. See :ref:`this diff --git a/test-suite/output/Search.out b/test-suite/output/Search.out index b88bb362732e..8704b54029ce 100644 --- a/test-suite/output/Search.out +++ b/test-suite/output/Search.out @@ -318,6 +318,13 @@ andb_true_intro: forall [b1 b2 : bool], b1 = true /\ b2 = true -> (b1 && b2)%bool = true andb_prop: forall a b : bool, (a && b)%bool = true -> a = true /\ b = true andb_prop: forall a b : bool, (a && b)%bool = true -> a = true /\ b = true +and_comm: forall A B : Prop, A /\ B <-> B /\ A +eq_id_comm_r: + forall [A : Type] (f : A -> A) (Hf : forall a : A, f a = a) (a : A), + f_equal f (Hf a) = Hf (f a) +eq_id_comm_l: + forall [A : Type] (f : A -> A) (Hf : forall a : A, a = f a) (a : A), + f_equal f (Hf a) = Hf (f a) h: n <> newdef n h': newdef n <> n h: n <> newdef n @@ -326,13 +333,13 @@ h: n <> newdef n h: n <> newdef n h: n <> newdef n h': newdef n <> n -File "./output/Search.v", line 24, characters 2-23: +File "./output/Search.v", line 32, characters 2-23: The command has indeed failed with message: [Focus] No such goal. -File "./output/Search.v", line 25, characters 2-25: +File "./output/Search.v", line 33, characters 2-25: The command has indeed failed with message: Query commands only support the single numbered goal selector. -File "./output/Search.v", line 26, characters 2-25: +File "./output/Search.v", line 34, characters 2-25: The command has indeed failed with message: Query commands only support the single numbered goal selector. h: P n diff --git a/test-suite/output/Search.v b/test-suite/output/Search.v index 7e405244df40..06ef533f691d 100644 --- a/test-suite/output/Search.v +++ b/test-suite/output/Search.v @@ -7,6 +7,14 @@ Search (@eq _ _ true). Search (@eq _ _ _) true -false. (* andb_prop *) Search (@eq _ _ _) true -false "prop" -"intro". (* andb_prop *) +Search "add_comm". +(* currently set to return results whose name contains a substring + with edit distance <=2 from the pattern: + + - and_comm (mutate the first "d") + - eq_id_comm_r and eq_id_comm_l (mutate the "a" and second "d") +*) + Definition newdef := fun x:nat => x. Goal forall n:nat, n <> newdef n -> newdef n <> n -> False. diff --git a/vernac/search.ml b/vernac/search.ml index fcc9f9db12b9..72cc695d0310 100644 --- a/vernac/search.ml +++ b/vernac/search.ml @@ -210,6 +210,17 @@ let module_filter : _ -> filter_function = fun mods ref kind env sigma typ -> let name_of_reference ref = Id.to_string (Nametab.basename_of_global ref) +let { Goptions.get = search_limit } = + Goptions.declare_int_option_and_ref ~key:["Fuzzy"; "Search"] ~value:2 () + +let string_contains_upto ?limit ~pattern s = + let limit = search_limit() in + (* for small patterns, allowing edits produces bad results *) + if String.length pattern <= 4 || limit = 0 then String.string_contains ~where:s ~what:pattern + else + let d = CString.edit_distance_substring ~limit:(limit+1) ~pattern s in + d <= limit + let search_filter : _ -> filter_function = fun query gr kind env sigma typ -> match query with | GlobSearchSubPattern (where,head,pat) -> let open Context.Rel.Declaration in @@ -227,8 +238,7 @@ let search_filter : _ -> filter_function = fun query gr kind env sigma typ -> ma if head then Constr_matching.is_matching_head else Constr_matching.is_matching_appsubterm ~closed:false in f env sigma pat (EConstr.of_constr typ)) typl -| GlobSearchString s -> - String.string_contains ~where:(name_of_reference gr) ~what:s +| GlobSearchString s -> string_contains_upto ~pattern:s (name_of_reference gr) | GlobSearchKind k -> (match kind with None -> false | Some k' -> k = k') | GlobSearchFilter f -> f gr