Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 149 additions & 46 deletions sys/Solr.php
Original file line number Diff line number Diff line change
Expand Up @@ -1365,37 +1365,67 @@ private function buildEscapedParts(array $tokens): array
$escapedParts = [];

foreach ($tokens as $t) {

switch ($t['type']) {

case 'phrase_slop':
$escapedParts[] = $this->buildPhraseToken($t);
break;
case 'phrase':
$escapedParts[] = $this->buildPhraseToken($t);
break;
case 'term':
$escapedParts[] = $this->escapeTerm($t['value']);
break;
case 'term_wildcard':
$escapedParts[] = $this->escapeTermKeepWildcardOperators($t['value']);
break;
case 'term_fuzzy':
if (preg_match('/^(.*?)(~\d+)$/', $t['value'], $m)) {
$escapedParts[] = $this->escapeTerm($m[1]) . $m[2];
} else {
$escapedParts[] = $this->escapeTerm($t['value']); // fallback safety
}
break;
case 'operator':
$escapedParts[] = strtoupper($t['value']);
break;
$part = $this->buildEscapedPart($t);
if ($part !== null) {
$escapedParts[] = $part;
}
}

return $escapedParts;
}

/**
 * Turn one classified token into its escaped query fragment.
 *
 * Phrase tokens go through buildPhraseToken(), plain and wildcard terms
 * through their respective escapers, operators are upper-cased, and compound
 * phrases are delegated to buildCompoundPhrasePart().
 *
 * @param array $token Token carrying a 'type' key and, for every handled
 *                     type, a 'value' key.
 * @return string|null The fragment, or null when the type is not handled
 *                     here or the compound phrase builder returns null.
 */
private function buildEscapedPart(array $token): ?string
{
    switch ($token['type']) {
        case 'phrase_slop':
        case 'phrase':
            return $this->buildPhraseToken($token);

        case 'term':
            return $this->escapeTerm($token['value']);

        case 'term_wildcard':
            return $this->escapeTermKeepWildcardOperators($token['value']);

        case 'term_fuzzy':
            // Split off a trailing "~<digits>" so only the term itself is escaped;
            // without such a suffix the whole value is escaped as an ordinary term.
            $fuzzyMatch = [];
            $hasFuzzySuffix = preg_match('/^(.*?)(~\d+)$/', $token['value'], $fuzzyMatch);

            return $hasFuzzySuffix
                ? $this->escapeTerm($fuzzyMatch[1]) . $fuzzyMatch[2]
                : $this->escapeTerm($token['value']);

        case 'operator':
            return strtoupper($token['value']);

        case 'compound_phrase':
            return $this->buildCompoundPhrasePart($token);
    }

    // Any other token type contributes nothing to the query.
    return null;
}

/**
 * Render a compound_phrase token as a single space-joined fragment.
 *
 * The nested tokens are escaped with buildEscapedParts() and joined with a
 * single space; no parentheses are added around the result.
 *
 * @param array $token Token whose ['value']['tokens'] holds the nested tokens.
 * @return string|null The joined fragment, or null when the nested token
 *                     list is missing, not an array, or produces no parts.
 */
private function buildCompoundPhrasePart(array $token): ?string
{
    $nestedTokens = $token['value']['tokens'] ?? null;
    if (!is_array($nestedTokens)) {
        return null;
    }

    $pieces = $this->buildEscapedParts($nestedTokens);

    return $pieces === [] ? null : implode(' ', $pieces);
}

/**
 * Build the double-quoted form of a query string.
 *
 * The value is trimmed, every double quote in it is removed (not escaped),
 * the remainder is passed through escapeLuceneLiteral(), and the result is
 * wrapped in one pair of double quotes.
 *
 * @param string $value Raw or already-assembled query text.
 * @return string The quoted text, or an empty string when the trimmed value
 *                is empty.
 */
private function quoteOnePhraseValue(string $value): string
{
    $phrase = trim($value);

    if ($phrase !== '') {
        $unquoted = str_replace('"', '', $phrase);
        $phrase = '"' . $this->escapeLuceneLiteral($unquoted) . '"';
    }

    return $phrase;
}

/*
* Flatten tokens back into a query string (for debugging or final output)
* Input: array of tokens with type and value
Expand All @@ -1414,6 +1444,11 @@ private function flattenTokens(array $tokens): string
case 'phrase':
$parts[] = '"' . $t['value']['text'] . '"';
break;
case 'compound_phrase':
if (isset($t['value']['tokens']) && is_array($t['value']['tokens'])) {
$parts[] = '(' . $this->flattenTokens($t['value']['tokens']) . ')';
}
break;
case 'term':
case 'term_wildcard':
case 'term_fuzzy':
Expand Down Expand Up @@ -1545,8 +1580,11 @@ public function build_and_or_onephrase($lookfor = null) {
$rawTokens = $this->tokenizeInput($lookfor);
// print_r("Tokenize : " . json_encode($rawTokens, JSON_UNESCAPED_UNICODE));

// Classify tokens into phrases and terms
$tokens = $this->classifyTokens($rawTokens);
// Classify tokens into phrases and terms
$tokens = $this->classifyTokens($rawTokens);

// Collapse contiguous quoted phrase + operator sequences into compound phrases
$tokens = $this->collapseCompoundPhrases($tokens);

// print_r("Tokens : " . json_encode($tokens, JSON_UNESCAPED_UNICODE));

Expand All @@ -1561,26 +1599,40 @@ public function build_and_or_onephrase($lookfor = null) {

// print_r("Escaped Parts : " . json_encode($escapedParts, JSON_UNESCAPED_UNICODE));

// TODO: In the future, $onephraseValue could be built in a more sophisticated way to preserve the original structure
// of the query as much as possible, while still escaping special characters.
// For example, we could keep the boolean operators in place for the onephrase version,
// but escape the terms and phrases properly.
// This would allow us to generate a more accurate onephrase query that reflects the user's original intent,
// while still being safe for Solr.
// For now, we will just join the escaped parts with spaces for the onephrase version, which is a simple approach that works for basic cases.
// The above are the escaped versions for their intended use in Solr queries.
$hasOperator = false;
foreach ($tokens as $t) {
if ($t['type'] === 'operator') {
$hasOperator = true;
break;
}
}
$onephraseValue = implode(' ', $escapedParts);

// This line capture historic behavior of the application, which is to generate a
// onephrase query by joining the escaped parts with spaces and wrapping the whole thing in quotes,
// which is a simple approach to retrieve results that match all the terms in the query, regardless of their order or proximity.
// In the future, we could enhance this logic to preserve the original structure of the query as much as possible,
// while still escaping special characters.
// Phrase search - "dramatic literature, comprehending critical"
$values['onephrase'] = implode(' ', $escapedParts);
if ($hasOperator) {
// Preserve explicit user boolean intent; do not auto-insert extra operators.
// AND search - dramatic AND literature, AND comprehending AND critical
// OR search - dramatic OR literature, OR comprehending OR critical
$values['and'] = $values['onephrase'];
$values['or'] = $values['onephrase'];
} else {
$values['and'] = implode(' AND ', $escapedParts);
$values['or'] = implode(' OR ', $escapedParts);
}
$values['onephrase'] = $this->quoteOnePhraseValue($onephraseValue);

// For AND and OR, we want to preserve the boolean operators as they are, but we want to remove
// them from the terms when building the AND and OR versions.
// For example, if the input is "dramatic AND literature OR comprehending NOT critical",
// we want to keep the operators for the onephrase version, but for the
// AND version we want "dramatic literature comprehending critical" with AND between the terms,
// We can achieve this by filtering out the operator tokens when building the AND and OR versions.
// This avoids to generate phrase like dramatic AND AND literature or dramatic OR OR literature,
// which causes syntax errors in Solr.
$operatorTokens = ['AND', 'OR', 'NOT'];
$termsOnlyParts = array_filter($escapedParts, function ($part) use ($operatorTokens) {
return !in_array(strtoupper($part), $operatorTokens, true);
});
// AND search - dramatic AND literature, AND comprehending AND critical
// OR search - dramatic OR literature, OR comprehending OR critical
$values['and'] = implode(' AND ', $termsOnlyParts);
$values['or'] = implode(' OR ', $termsOnlyParts);

// The below are the raw flattened versions for debugging and building other query types.
// As-is search - dramatic literature, comprehending critical
Expand All @@ -1593,7 +1645,7 @@ public function build_and_or_onephrase($lookfor = null) {
// If the input is a phrase with a trailing wildcard, we want to preserve the wildcard in the exactmatcher startswith version. For example, "foo bar"* should become foo bar* for the startswith version, not foo bar**.
$values['emstartswith'] = str_replace('*', '', $values['exactmatcher']) . '*';

// print_r("Sematic Structure : " . json_encode($values, JSON_UNESCAPED_UNICODE));
//print_r("Sematic Structure : " . json_encode($values, JSON_UNESCAPED_UNICODE));

return $values;
}
Expand Down Expand Up @@ -1659,6 +1711,57 @@ function classifyTokens(array $tokens): array {
return $classified;
}

/**
 * Merge runs of "phrase OPERATOR phrase [OPERATOR phrase ...]" into a single
 * compound_phrase token.
 *
 * Starting at each phrase-like token, operator/phrase pairs are consumed
 * greedily for as long as an AND/OR/NOT operator is directly followed by
 * another phrase-like token. A run with at least one such pair is replaced by
 * one token of type 'compound_phrase' whose ['value']['tokens'] holds the run
 * in its original order. Every other token is copied through unchanged.
 *
 * @param array $tokens Classified tokens, indexed 0..n-1.
 * @return array The token list with qualifying runs collapsed.
 */
private function collapseCompoundPhrases(array $tokens): array
{
    $result = [];
    $count = count($tokens);
    $position = 0;

    while ($position < $count) {
        $token = $tokens[$position];
        $run = [];
        $next = $position + 1;

        if ($this->isPhraseLikeToken($token)) {
            $run[] = $token;

            // Consume (operator, phrase) pairs two tokens at a time.
            for (; $next + 1 < $count; $next += 2) {
                $joinsPhrases = $this->isBooleanJoinOperator($tokens[$next])
                    && $this->isPhraseLikeToken($tokens[$next + 1]);
                if (!$joinsPhrases) {
                    break;
                }
                $run[] = $tokens[$next];
                $run[] = $tokens[$next + 1];
            }
        }

        // Fewer than three entries means no operator/phrase pair was found.
        if (count($run) < 3) {
            $result[] = $token;
            $position++;
            continue;
        }

        $result[] = [
            'type' => 'compound_phrase',
            'value' => [
                'tokens' => $run,
            ],
        ];
        $position = $next;
    }

    return $result;
}

/**
 * Tell whether a token is a quoted phrase, with or without slop.
 *
 * @param array $token Token carrying a 'type' key.
 * @return bool True when the type is exactly 'phrase' or 'phrase_slop'.
 */
private function isPhraseLikeToken(array $token): bool
{
    $type = $token['type'];

    return $type === 'phrase' || $type === 'phrase_slop';
}

/**
 * Tell whether a token is an AND, OR or NOT operator.
 *
 * The operator text is compared case-insensitively; the token type must be
 * exactly 'operator'.
 *
 * @param array $token Token carrying 'type' and, for operators, 'value'.
 * @return bool True for an operator token whose value is AND, OR or NOT.
 */
private function isBooleanJoinOperator(array $token): bool
{
    if ($token['type'] !== 'operator') {
        return false;
    }

    $operator = strtoupper($token['value']);

    return in_array($operator, ['AND', 'OR', 'NOT'], true);
}

/**
* Construct, perform, and process the search
*
Expand Down
23 changes: 18 additions & 5 deletions test/SolrQueryTest/BuildAndOrOnePhraseTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public function testAllowsFuzzyTerm()
$result = $this->solr->build_and_or_onephrase('table~2');

$this->assertIsArray($result);
$this->assertEquals('table~2', $result['onephrase']);
$this->assertEquals('"table~2"', $result['onephrase']);
$this->assertEquals('table~2', $result['asis']);
}

Expand All @@ -72,7 +72,7 @@ public function testAllowsQuotedFuzzyPhrase()
{
$result = $this->solr->build_and_or_onephrase('"table chair"~2');
$this->assertIsArray($result);
$this->assertEquals('"table chair"~2', $result['onephrase']);
$this->assertEquals('"table chair~2"', $result['onephrase']);
$this->assertEquals('"table chair"~2', $result['asis']);
}

Expand All @@ -85,7 +85,7 @@ public function testAllowsNormalTerm()
$result = $this->solr->build_and_or_onephrase('table');

$this->assertIsArray($result);
$this->assertEquals('table', $result['onephrase']);
$this->assertEquals('"table"', $result['onephrase']);
$this->assertEquals('table', $result['asis']);
}

Expand All @@ -98,11 +98,24 @@ public function testAllowsWildcardTerm()
$result = $this->solr->build_and_or_onephrase('table*');

$this->assertIsArray($result);
$this->assertEquals('table*', $result['onephrase']);
$this->assertEquals('"table*"', $result['onephrase']);
$this->assertEquals('table*', $result['asis']);
$this->assertEquals('table*', $result['emstartswith']);
}

/**
 * Two quoted phrases joined by an explicit operator must come back unchanged
 * in both the 'and' and the 'or' variants, and the operator must still be
 * present in the 'asis' variant.
 *
 * @covers Solr::build_and_or_onephrase
 */
public function testCompoundPhrasesKeepOperatorsInTermsOnly(): void
{
    $query = '"its origin" OR "illustrant"';

    $result = $this->solr->build_and_or_onephrase($query);

    $this->assertIsArray($result);
    foreach (['and', 'or'] as $variant) {
        $this->assertSame($query, $result[$variant]);
    }
    $this->assertStringContainsString('OR', $result['asis']);
}

/**
* @covers Solr::build_and_or_onephrase
*/
Expand Down Expand Up @@ -135,4 +148,4 @@ public function testMultipleValidationErrorsAreHandledIteratively(): void
}

}
?>
?>
34 changes: 30 additions & 4 deletions test/SolrQueryTest/SolrEscapingTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ public function testBuildEscapedPartsSpecialCharacterCombinations(): void
}

/**
* @covers Solr::buildEscapedParts
* @covers Solr::buildEscapedParts
*/
public function testBuildEscapedPartsRealWorldQueryExample(): void
{
Expand All @@ -278,10 +278,36 @@ public function testBuildEscapedPartsRealWorldQueryExample(): void
'C\\+\\+',
'OR',
'title\\:history\\/*',
'NOT',
'library?',
'NOT',
'library?',
],
$escapedParts
);
}

/**
* @covers Solr::buildEscapedParts
*/
public function testBuildEscapedPartsProductivelyCompoundsPhrases(): void
{
$solr = new Solr('', '');

$tokens = [
[
'type' => 'compound_phrase',
'value' => [
'tokens' => [
['type' => 'phrase', 'value' => ['text' => 'its origin', 'slop' => null]],
['type' => 'operator', 'value' => 'OR'],
['type' => 'phrase', 'value' => ['text' => 'illustrant', 'slop' => null]],
],
],
],
$escapedParts
];

$this->assertSame(
['"its origin" OR "illustrant"'],
$this->invokeBuildEscapedParts($solr, $tokens)
);
}

Expand Down
6 changes: 3 additions & 3 deletions test/SolrQueryTest/SolrQueryFullPipelineTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ public function test_queryFullPipeline(): void
'title_a:(smart)^15000', // exactmatcher
'titleProper:(smart*)^8000', // emstartswith

'titleProper:(smart)^1200', // onephrase field
'titleProper:("smart")^1200', // onephrase field

'title_topProper:(smart)^600', // onephrase field
'title_topProper:("smart")^600', // onephrase field

'series2:(smart)^500' // onephrase field
'series2:("smart")^500' // onephrase field
];

foreach ($expectedClauses as $clause) {
Expand Down
Loading