Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,25 @@ docker compose build
docker compose run --rm playwright
```

## How to run a PHP script that shows the Solr query

```bash
docker compose run vufind php bin/PrintSolrQuery.php 'charles dickens OR "weekly"' title
```

The output of this script looks like:

```bash

----- Tokenized Search ----- : ["charles","dickens","OR","\"weekly\""]
----- Classified Tokens ----- : [{"type":"term","value":"charles"},{"type":"term","value":"dickens"},{"type":"operator","value":"OR"},{"type":"phrase","value":{"text":"weekly","slop":null}}]
----- Tokens after collapsing compound phrases ----- : [{"type":"term","value":"charles"},{"type":"compound_phrase","value":{"tokens":[{"type":"term","value":"dickens"},{"type":"operator","value":"OR"},{"type":"phrase","value":{"text":"weekly","slop":null}}]}}]
----- Escaped Parts ----- : ["charles","dickens OR \"weekly\""]
----- Semantic Structure ----- : {"onephrase":"\"charles dickens OR weekly\"","and":"charles AND dickens OR \"weekly\"","or":"charles OR dickens OR \"weekly\"","asis":"charles (dickens OR \"weekly\")","compressed":"charles\\(dickensOR\\\"weekly\\\"\\)","exactmatcher":"charlesdickensorweekly","emstartswith":"charlesdickensorweekly*"}
----- Solr Search ----- : "(title_ab:(charlesdickensorweekly)^25000 OR title_a:(charlesdickensorweekly)^15000 OR titleProper:(charlesdickensorweekly*)^8000 OR titleProper:(\"charles dickens OR weekly\")^1200 OR titleProper:(charles AND dickens OR \"weekly\")^120 OR title_topProper:(\"charles dickens OR weekly\")^600 OR title_topProper:(charles AND dickens OR \"weekly\")^60 OR title_restProper:(\"charles dickens OR weekly\")^400 OR title_restProper:(charles AND dickens OR \"weekly\")^40 OR series:(\"charles dickens OR weekly\")^500 OR series:(charles AND dickens OR \"weekly\")^50 OR series2:(\"charles dickens OR weekly\")^500 OR series2:(charles AND dickens OR \"weekly\")^50 OR title:(charles AND dickens OR \"weekly\")^30 OR title_top:(charles AND dickens OR \"weekly\")^20 OR title_rest:(charles AND dickens OR \"weekly\")^1)"

```

## What Works

See all records with http://localhost:8080/Search/Home
Expand Down
83 changes: 83 additions & 0 deletions bin/PrintSolrQuery.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env php
<?php
/**
* Standalone helper that builds a SearchStructure from a single user query
* string, runs it through Solr::simplesearch, and prints the generated Solr
* search arguments.
*
* This script can be run from the project root.
*
* Usage: php PrintSolrQuery.php "search string" [type]
* - type defaults to "title"
*/

require_once __DIR__ . '/../sys/Solr.php';
require_once __DIR__ . '/../sys/SolrConnection.php';
require_once __DIR__ . '/../services/Search/SearchStructure.php';

/**
 * Call the non-public collapseCompoundPhrases() method of $solr via reflection.
 *
 * @param object $solr   Object that declares a collapseCompoundPhrases() method.
 * @param array  $tokens Classified tokens handed to the method unchanged.
 *
 * @return array The value returned by collapseCompoundPhrases().
 *
 * @throws ReflectionException If $solr has no collapseCompoundPhrases() method.
 */
function invokecollapseCompoundPhrases($solr, $tokens): array
{
    $reflection = new ReflectionClass($solr);
    $method = $reflection->getMethod('collapseCompoundPhrases');

    // Before PHP 8.1, reflection refuses to invoke private/protected methods
    // unless they are explicitly made accessible. From 8.1 on the call has no
    // effect, so it is only made on the versions that need it.
    if (PHP_VERSION_ID < 80100) {
        $method->setAccessible(true);
    }

    return $method->invoke($solr, $tokens);
}

/**
 * Call the non-public buildEscapedParts() method of $solr via reflection.
 *
 * @param object $solr   Object that declares a buildEscapedParts() method.
 * @param array  $tokens Tokens handed to the method unchanged.
 *
 * @return array The value returned by buildEscapedParts().
 *
 * @throws ReflectionException If $solr has no buildEscapedParts() method.
 */
function invokebuildEscapedParts($solr, $tokens): array
{
    $reflection = new ReflectionClass($solr);
    $method = $reflection->getMethod('buildEscapedParts');

    // Before PHP 8.1, reflection refuses to invoke private/protected methods
    // unless they are explicitly made accessible. From 8.1 on the call has no
    // effect, so it is only made on the versions that need it.
    if (PHP_VERSION_ID < 80100) {
        $method->setAccessible(true);
    }

    return $method->invoke($solr, $tokens);
}

// --- Command-line arguments: the search string is required, the type is optional ---
$query = $argv[1] ?? '';
$type = $argv[2] ?? 'title';

if ($query === '') {
    fwrite(STDERR, "Usage: php PrintSolrQuery.php \"search string\" [type]\n");
    exit(1);
}

// --- Configuration ---
// parse_ini_file() returns false when the file is missing or malformed. Stop
// with a clear message instead of silently continuing with a config array
// that would contain nothing but the Site.local override set below.
global $configArray;
$configPath = __DIR__ . '/../conf/config.ini';
$configArray = parse_ini_file($configPath, true);
if ($configArray === false) {
    fwrite(STDERR, "Unable to read configuration file: " . $configPath . "\n");
    exit(1);
}
$configArray['Site']['local'] = '/app';

// --- Simulated web request ---
// NOTE(review): SearchStructure is constructed without arguments, so it
// presumably reads these superglobals; confirm against its constructor.
$_SERVER['HTTP_HOST'] = $_SERVER['SERVER_ADDR'] = 'localhost';
$_REQUEST['lookfor'][] = $query;
$_REQUEST['type'][] = $type;
$_REQUEST['action'] = 'standard';
$_REQUEST['pagesize'] = 1;

$ss = new SearchStructure();

$solr = new Solr('', '');

// Build the Solr arguments with the builder that matches the search mode.
$args = $ss->use_dismax
    ? $solr->dismaxSearchArguments($ss)
    : $solr->searchArguments($ss);

// Prints one labelled stage as "----- <label> ----- : <json>" followed by a newline.
$printStage = function ($label, $value) {
    print_r("----- " . $label . " ----- : " . json_encode($value, JSON_UNESCAPED_UNICODE));
    print("\n");
};

// --- Show every intermediate stage of the query parsing, in pipeline order ---
$tokenized_query = $solr->tokenizeInput($query);
$printStage('Tokenized Search', $tokenized_query);

$classified_tokens = $solr->classifyTokens($tokenized_query);
$printStage('Classified Tokens', $classified_tokens);

$tokens = invokecollapseCompoundPhrases($solr, $classified_tokens);
$printStage('Tokens after collapsing compound phrases', $tokens);

$escapedParts = invokebuildEscapedParts($solr, $tokens);
$printStage('Escaped Parts', $escapedParts);

$semanticStructure = $solr->build_and_or_onephrase($query);
$printStage('Semantic Structure', $semanticStructure);

// The query string is read from the second element of the first argument
// entry built above.
$printStage('Solr Search', $args[0][1]);
56 changes: 49 additions & 7 deletions sys/Solr.php
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ function simplesearch($ss, $start = 0, $limit = null, $raw = false) {
$args = array_merge($args, $this->spellcheckComponents($ss));
}

//print_r("----- Solr query ------: " . json_encode($args, JSON_UNESCAPED_UNICODE));
// print_r("----- Solr query ------: " . json_encode($args, JSON_UNESCAPED_UNICODE));

// $raw is always false, so rawSolrSearch is never used
if ($raw) {
Expand Down Expand Up @@ -795,8 +795,20 @@ private function __buildQueryString($structure, $values, $joiner = "OR") {
if (!isset($values[$val]) || ($values[$val] == "")) {
continue;
}
// All the values are generated by build_and_or_onephrase, so they are already escaped and quoted as needed, except for "asis", which is the raw user input that we want to search as a phrase.
// "asis" is also used to generate other values such as stdnum and lcnormalized; for that reason we don't escape it in build_and_or_onephrase. We escape it here to ensure that queries by id, ISSN and ISBN don't break the Solr parser.
if ($val == 'asis') {
// If the value is "asis", we want to search it as a phrase, so we quote it and escape internal quotes
$escaped_value = $this->escapeTerm($values[$val]);
}
else {
// Otherwise, the value is used unchanged (it was already escaped by build_and_or_onephrase)
$escaped_value = $values[$val];
}

$sstring = $field . ':(' . $escaped_value . ')';

$sstring = $field . ':(' . $values[$val] . ')';
// $sstring = $field . ':(' . $values[$val] . ')';
if (isset($weight) && $weight > 0) {
$sstring .= '^' . $weight;
}
Expand Down Expand Up @@ -1363,14 +1375,12 @@ private function buildPhraseToken(array $token): string
/**
 * Collect the escaped form of every token, in order.
 *
 * Each token is passed to buildEscapedPart(); tokens for which that method
 * returns null are left out of the result.
 *
 * @param array $tokens Tokens to convert.
 *
 * @return array Sequentially indexed list of the non-null escaped parts.
 */
private function buildEscapedParts(array $tokens): array
{
    $candidates = array_map(
        function ($token) {
            return $this->buildEscapedPart($token);
        },
        $tokens
    );

    $kept = array_filter(
        $candidates,
        static function ($candidate) {
            return $candidate !== null;
        }
    );

    // array_filter() keeps the original keys; reindex so the result is a plain list.
    return array_values($kept);
}

Expand Down Expand Up @@ -1583,16 +1593,21 @@ public function build_and_or_onephrase($lookfor = null) {
// Classify tokens into phrases and terms
$tokens = $this->classifyTokens($rawTokens);

// print_r("Classified Tokens : " . json_encode($tokens, JSON_UNESCAPED_UNICODE));

// Collapse contiguous quoted phrase + operator sequences into compound phrases
$tokens = $this->collapseCompoundPhrases($tokens);

// print_r("Tokens after collapsing compound phrases : " . json_encode($tokens, JSON_UNESCAPED_UNICODE));

// print_r("Tokens : " . json_encode($tokens, JSON_UNESCAPED_UNICODE));

// Build semantic values from classified tokens and escape them properly for their intended use
$escapedParts = [];

// Create the escaped parts for phrases and terms, which will be used for building the different query types (onephrase, and, or)
$escapedParts = $this->buildEscapedParts($tokens);
//print_r("Escaped Parts : " . json_encode($escapedParts, JSON_UNESCAPED_UNICODE));

// Create the flatten tokens for debugging and building the as-is, compressed, exactmatcher, and emstartswith queries
$flattenTokes = $this->flattenTokens($tokens);
Expand Down Expand Up @@ -1638,14 +1653,14 @@ public function build_and_or_onephrase($lookfor = null) {
// As-is search - dramatic literature, comprehending critical
$values['asis'] = $flattenTokes;
// Compressed search - dramaticliterature,comprehendingcritical
$values['compressed'] = preg_replace('/\s/', '', $values['asis']);
$values['compressed'] = preg_replace('/\s/', '', $this->escapeTerm($values['asis']));
// Exactmatcher search - dramaticliteraturecomprehendingcritical
$values['exactmatcher'] = $this->exactmatcherify($flattenTokes);
// Exactmatcher startswith search - dramaticliteraturecomprehendingcritical*
// Any '*' already present in the exactmatcher value is removed before a single trailing '*' is appended, so the startswith version always ends in exactly one wildcard (e.g. for an input such as "foo bar"*).
$values['emstartswith'] = str_replace('*', '', $values['exactmatcher']) . '*';

//print_r("Sematic Structure : " . json_encode($values, JSON_UNESCAPED_UNICODE));
// print_r("Sematic Structure : " . json_encode($values, JSON_UNESCAPED_UNICODE));

return $values;
}
Expand Down Expand Up @@ -1711,6 +1726,33 @@ function classifyTokens(array $tokens): array {
return $classified;
}

/*
* Collapse sequences of phrase-like tokens joined by boolean operators into compound phrases
* This is to handle cases where users input something like "foo bar" AND "baz qux" OR "quux corge", which should be treated as a single compound phrase for the onephrase version, rather than separate phrases joined by operators.
* The logic is:
* - Iterate through the classified tokens
* - When we encounter a phrase-like token (phrase, phrase_slop, term, term_wildcard, term_fuzzy), we look ahead to see if it is followed by a boolean operator and another phrase-like token
* - If we find a sequence of phrase-like tokens joined by boolean operators, we collapse them into a single 'compound_phrase' token that contains the sequence of tokens
* - This allows us to preserve the original structure of the query for the onephrase version, while still being able to build the AND and OR versions by filtering out the operators later
* Example:
* Input: [
* ['type' => 'phrase', 'value' => ['text' => 'foo bar', 'slop' => null]],
* ['type' => 'operator', 'value' => 'AND'],
* ['type' => 'phrase', 'value' => ['text' => 'baz qux', 'slop' => null]],
* ['type' => 'operator', 'value' => 'OR'],
* ['type' => 'phrase', 'value' => ['text' => 'quux corge', 'slop' => null]],
* ]
* Output: [
* ['type' => 'compound_phrase', 'value' => ['tokens' => [
* ['type' => 'phrase', 'value' => ['text' => 'foo bar', 'slop' => null]],
* ['type' => 'operator', 'value' => 'AND'],
* ['type' => 'phrase', 'value' => ['text' => 'baz qux', 'slop' => null]],
* ['type' => 'operator', 'value' => 'OR'],
* ['type' => 'phrase', 'value' => ['text' => 'quux corge', 'slop' => null]],
* ]]],
* ]
* This way, the onephrase version can treat the whole sequence as a single unit, while the AND and OR versions can still apply the boolean logic by filtering out the operators.
*/
private function collapseCompoundPhrases(array $tokens): array
{
$collapsed = [];
Expand Down Expand Up @@ -1754,7 +1796,7 @@ private function collapseCompoundPhrases(array $tokens): array

/**
 * Tell whether a classified token is "phrase-like".
 *
 * Phrase-like types are: phrase, phrase_slop, term, term_wildcard and
 * term_fuzzy. Any other type (for example "operator") is not phrase-like.
 *
 * @param array $token Classified token; its 'type' entry is inspected.
 *
 * @return bool True when the token type is one of the phrase-like types.
 */
private function isPhraseLikeToken(array $token): bool
{
    // A single return: an earlier, narrower check (phrase/phrase_slop only)
    // placed before this one would make the term types unreachable.
    return in_array(
        $token['type'],
        ['phrase', 'phrase_slop', 'term', 'term_wildcard', 'term_fuzzy'],
        true
    );
}

private function isBooleanJoinOperator(array $token): bool
Expand Down
78 changes: 78 additions & 0 deletions test/SolrQueryTest/ClassifySyntaxQueryTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,84 @@ public function testClassifyDoesNotSplitBooleanPhrase()

}

/**
 * Follow the query 'charles dickens OR "weekly"' through every parsing stage
 * (tokenize, classify, collapse, escape) and pin the output of each stage.
 * @covers Solr::tokenizeInput
 * @covers Solr::classifyTokens
 * @covers Solr::collapseCompoundPhrases
 * @covers Solr::buildEscapedParts
 */
public function testTokenizerANDCollapseCompoundPhrases()
{
    // Stage 1 - tokenizer: the input is split into terms, operators and quoted phrases.
    $rawTokens = $this->solr->tokenizeInput('charles dickens OR "weekly"');
    $this->assertSame(['charles', 'dickens', 'OR', '"weekly"'], $rawTokens);

    // Building blocks shared by the expectations below.
    $charles = ['type' => 'term', 'value' => 'charles'];
    $dickens = ['type' => 'term', 'value' => 'dickens'];
    $orOperator = ['type' => 'operator', 'value' => 'OR'];
    $weekly = ['type' => 'phrase', 'value' => ['text' => 'weekly', 'slop' => null]];

    // Stage 2 - classifyTokens: every raw token gets a type (term, operator or phrase).
    $classified = $this->solr->classifyTokens(['charles', 'dickens', 'OR', '"weekly"']);
    $this->assertSame([$charles, $dickens, $orOperator, $weekly], $classified);

    // Stage 3 - collapseCompoundPhrases: "dickens OR weekly" becomes one compound token.
    $collapsed = $this->invokecollapseCompoundPhrases($this->solr, $classified);
    $expectedCollapsed = [
        $charles,
        [
            'type' => 'compound_phrase',
            'value' => [
                'tokens' => [$dickens, $orOperator, $weekly],
            ],
        ],
    ];
    $this->assertSame($expectedCollapsed, $collapsed);

    // Stage 4 - buildEscapedParts: one escaped string per collapsed token.
    $parts = $this->invokebuildEscapedParts($this->solr, $collapsed);
    $this->assertSame(['charles', 'dickens OR "weekly"'], $parts);
}

/**
 * Call the non-public collapseCompoundPhrases() method of $solr via reflection.
 *
 * @param object $solr   Object that declares a collapseCompoundPhrases() method.
 * @param array  $tokens Classified tokens handed to the method unchanged.
 *
 * @return array The value returned by collapseCompoundPhrases().
 *
 * @throws ReflectionException If $solr has no collapseCompoundPhrases() method.
 */
private function invokecollapseCompoundPhrases($solr, $tokens): array
{
    $reflection = new ReflectionClass($solr);
    $method = $reflection->getMethod('collapseCompoundPhrases');

    // Before PHP 8.1, reflection refuses to invoke private/protected methods
    // unless they are explicitly made accessible. From 8.1 on the call has no
    // effect, so it is only made on the versions that need it.
    if (PHP_VERSION_ID < 80100) {
        $method->setAccessible(true);
    }

    return $method->invoke($solr, $tokens);
}

/**
 * Call the non-public buildEscapedParts() method of $solr via reflection.
 *
 * @param object $solr   Object that declares a buildEscapedParts() method.
 * @param array  $tokens Tokens handed to the method unchanged.
 *
 * @return array The value returned by buildEscapedParts().
 *
 * @throws ReflectionException If $solr has no buildEscapedParts() method.
 */
private function invokebuildEscapedParts($solr, $tokens): array
{
    $reflection = new ReflectionClass($solr);
    $method = $reflection->getMethod('buildEscapedParts');

    // Before PHP 8.1, reflection refuses to invoke private/protected methods
    // unless they are explicitly made accessible. From 8.1 on the call has no
    // effect, so it is only made on the versions that need it.
    if (PHP_VERSION_ID < 80100) {
        $method->setAccessible(true);
    }

    return $method->invoke($solr, $tokens);
}


}
?>
Loading