hathitrust · liseli · Mar 25, 2026 · Mar 17, 2026
diff --git a/README.md b/README.md
@@ -35,6 +35,25 @@ docker compose build
 docker compose run --rm playwright
 ```
 
+## How to run a PHP script that shows the Solr query
+
+```bash
+docker compose run vufind php bin/PrintSolrQuery.php 'charles dickens OR "weekly"' title 
+```
+
+The output of this script looks like:
+
+```bash
+
+----- Tokenized Search ----- : ["charles","dickens","OR","\"weekly\""]
+ ----- Classified Tokens ----- : [{"type":"term","value":"charles"},{"type":"term","value":"dickens"},{"type":"operator","value":"OR"},{"type":"phrase","value":{"text":"weekly","slop":null}}]
+ ----- Tokens after collapsing compound phrases ----- : [{"type":"term","value":"charles"},{"type":"compound_phrase","value":{"tokens":[{"type":"term","value":"dickens"},{"type":"operator","value":"OR"},{"type":"phrase","value":{"text":"weekly","slop":null}}]}}]
+ ----- Escaped Parts ----- : ["charles","dickens OR \"weekly\""]
+ ----- Semantic Structure ----- : {"onephrase":"\"charles dickens OR weekly\"","and":"charles AND dickens OR \"weekly\"","or":"charles OR dickens OR \"weekly\"","asis":"charles (dickens OR \"weekly\")","compressed":"charles\\(dickensOR\\\"weekly\\\"\\)","exactmatcher":"charlesdickensorweekly","emstartswith":"charlesdickensorweekly*"}
+ -----  Solr Search ----- : "(title_ab:(charlesdickensorweekly)^25000 OR title_a:(charlesdickensorweekly)^15000 OR titleProper:(charlesdickensorweekly*)^8000 OR titleProper:(\"charles dickens OR weekly\")^1200 OR titleProper:(charles AND dickens OR \"weekly\")^120 OR title_topProper:(\"charles dickens OR weekly\")^600 OR title_topProper:(charles AND dickens OR \"weekly\")^60 OR title_restProper:(\"charles dickens OR weekly\")^400 OR title_restProper:(charles AND dickens OR \"weekly\")^40 OR series:(\"charles dickens OR weekly\")^500 OR series:(charles AND dickens OR \"weekly\")^50 OR series2:(\"charles dickens OR weekly\")^500 OR series2:(charles AND dickens OR \"weekly\")^50 OR title:(charles AND dickens OR \"weekly\")^30 OR title_top:(charles AND dickens OR \"weekly\")^20 OR title_rest:(charles AND dickens OR \"weekly\")^1)"
+
+```
+
 ## What Works
 
 See all records with http://localhost:8080/Search/Home

diff --git a/bin/PrintSolrQuery.php b/bin/PrintSolrQuery.php
@@ -0,0 +1,83 @@
+#!/usr/bin/env php
+<?php
+/**
+ * Standalone helper that builds a SearchStructure from a single user query
+ * string, asks Solr for the matching search arguments, and prints each
+ * intermediate parsing stage followed by the generated Solr query.
+ *
+ * This script can be run from the project root.
+ *
+ * Usage: php bin/PrintSolrQuery.php "search string" [type]
+ *   - type defaults to "title"
+ */
+
+require_once __DIR__ . '/../sys/Solr.php';
+require_once __DIR__ . '/../sys/SolrConnection.php';
+require_once __DIR__ . '/../services/Search/SearchStructure.php';
+
+function invokecollapseCompoundPhrases($solr, $tokens): array
+{
+    // Solr::collapseCompoundPhrases() is private, so it is called through reflection.
+    return (new ReflectionClass($solr))
+        ->getMethod('collapseCompoundPhrases')
+        ->invoke($solr, $tokens);
+}
+
+function invokebuildEscapedParts($solr, $tokens): array
+{
+    // Solr::buildEscapedParts() is private, so it is called through reflection.
+    return (new ReflectionClass($solr))
+        ->getMethod('buildEscapedParts')
+        ->invoke($solr, $tokens);
+}
+
+$query = $argv[1] ?? '';
+$type = $argv[2] ?? 'title';
+
+if ($query === '') {
+    // Name the script as it was invoked so the hint is correct from any directory.
+    fwrite(STDERR, sprintf("Usage: php %s \"search string\" [type]\n", $argv[0] ?? 'bin/PrintSolrQuery.php'));
+    exit(1);
+}
+
+// The parsed configuration is published through the $configArray global.
+// parse_ini_file() returns false on failure, so stop instead of building on a broken config.
+global $configArray;
+$configPath = __DIR__ . '/../conf/config.ini';
+$configArray = parse_ini_file($configPath, true);
+if ($configArray === false) {
+    fwrite(STDERR, "Unable to read configuration file: {$configPath}\n");
+    exit(1);
+}
+$configArray['Site']['local'] = '/app';
+
+// Simulate the web request for a single-field search.
+// NOTE(review): SearchStructure takes no arguments, so it presumably reads these superglobals — confirm.
+$_SERVER['HTTP_HOST'] = $_SERVER['SERVER_ADDR'] = 'localhost';
+$_REQUEST['lookfor'][] = $query;
+$_REQUEST['type'][] = $type;
+$_REQUEST['action'] = 'standard';
+$_REQUEST['pagesize'] = 1;
+
+$ss = new SearchStructure();
+$solr = new Solr('', '');
+$args = $ss->use_dismax ? $solr->dismaxSearchArguments($ss) : $solr->searchArguments($ss);
+
+// Print one labelled stage as a single line of JSON.
+$printStage = static function (string $label, $value) {
+    print($label . json_encode($value, JSON_UNESCAPED_UNICODE) . "\n");
+};
+
+$tokenized_query = $solr->tokenizeInput($query);
+$printStage("----- Tokenized Search ----- : ", $tokenized_query);
+$classified_tokens = $solr->classifyTokens($tokenized_query);
+$printStage("----- Classified Tokens ----- : ", $classified_tokens);
+$tokens = invokecollapseCompoundPhrases($solr, $classified_tokens);
+$printStage("----- Tokens after collapsing compound phrases ----- : ", $tokens);
+$escapedParts = invokebuildEscapedParts($solr, $tokens);
+$printStage("----- Escaped Parts ----- : ", $escapedParts);
+$semanticStructure = $solr->build_and_or_onephrase($query);
+$printStage("----- Semantic Structure ----- : ", $semanticStructure);
+
+// NOTE(review): assumes the query clause sits at $args[0][1] — confirm this also holds for the dismax arguments.
+$printStage("-----  Solr Search ----- : ", $args[0][1]);
diff --git a/sys/Solr.php b/sys/Solr.php
@@ -129,7 +129,7 @@ function simplesearch($ss, $start = 0, $limit = null, $raw = false) {
       $args = array_merge($args, $this->spellcheckComponents($ss));
     }
 
-    //print_r("----- Solr query ------: " . json_encode($args, JSON_UNESCAPED_UNICODE));
+    // print_r("----- Solr query ------: " . json_encode($args, JSON_UNESCAPED_UNICODE));
 
     // $raw is always false, so rawSolrSearch is never used
     if ($raw) {
@@ -795,8 +795,20 @@ private function __buildQueryString($structure, $values, $joiner = "OR") {
         if (!isset($values[$val]) || ($values[$val] == "")) {
           continue;
         }
+        // All the values are generated by build_and_or_onephrase, so they are already escaped and quoted as needed, except for "asis", which is the raw user input that we want to search as a phrase.
+        // "asis" is also used to derive other values such as stdnum and lcnormalized, so it must not be escaped inside build_and_or_onephrase. Instead, we escape it here so that searches by id, ISSN, and ISBN don't break the Solr parser.
+        if ($val == 'asis') {
+          // If the value is "asis", we want to search it as a phrase, so we quote it and escape internal quotes
+          $escaped_value = $this->escapeTerm($values[$val]);
+        }
+        else {
+            // Otherwise, the value was already escaped by build_and_or_onephrase, so we use it unchanged
+            $escaped_value = $values[$val];
+        }
+
+        $sstring = $field . ':(' . $escaped_value . ')';
 
-        $sstring = $field . ':(' . $values[$val] . ')';
+        // $sstring = $field . ':(' . $values[$val] . ')';
         if (isset($weight) && $weight > 0) {
           $sstring .= '^' . $weight;
         }
@@ -1363,14 +1375,12 @@ private function buildPhraseToken(array $token): string
   private function buildEscapedParts(array $tokens): array
   {
     $escapedParts = [];
-
     foreach ($tokens as $t) {
         $part = $this->buildEscapedPart($t);
         if ($part !== null) {
             $escapedParts[] = $part;
         }
     }
-
     return $escapedParts;
   }
 
@@ -1583,16 +1593,21 @@ public function build_and_or_onephrase($lookfor = null) {
     // Classify tokens into phrases and terms
     $tokens = $this->classifyTokens($rawTokens);
 
+    // print_r("Classified Tokens : " . json_encode($tokens, JSON_UNESCAPED_UNICODE));
+
     // Collapse contiguous quoted phrase + operator sequences into compound phrases
     $tokens = $this->collapseCompoundPhrases($tokens);
 
+    // print_r("Tokens after collapsing compound phrases : " . json_encode($tokens, JSON_UNESCAPED_UNICODE));
+
     // print_r("Tokens : " . json_encode($tokens, JSON_UNESCAPED_UNICODE));
 
     // Build semantic values from classified tokens and escape them properly for their intended use
     $escapedParts = [];
 
     // Create the escaped parts for phrases and terms, which will be used for building the different query types (onephrase, and, or)
     $escapedParts = $this->buildEscapedParts($tokens);
+    //print_r("Escaped Parts : " . json_encode($escapedParts, JSON_UNESCAPED_UNICODE));
 
     // Create the flatten tokens for debugging and building the as-is, compressed, exactmatcher, and emstartswith queries
     $flattenTokes = $this->flattenTokens($tokens);
@@ -1638,14 +1653,14 @@ public function build_and_or_onephrase($lookfor = null) {
     // As-is search - dramatic literature, comprehending critical
     $values['asis'] = $flattenTokes;
     // Compressed search - dramaticliterature,comprehendingcritical
-    $values['compressed'] = preg_replace('/\s/', '', $values['asis']);
+    $values['compressed'] = preg_replace('/\s/', '', $this->escapeTerm($values['asis']));
     // Exactmatcher search - dramaticliteraturecomprehendingcritical
     $values['exactmatcher'] = $this->exactmatcherify($flattenTokes);
     // Exactmatcher startswith search - dramaticliteraturecomprehendingcritical*
     // If the input is a phrase with a trailing wildcard, we want to preserve the wildcard in the exactmatcher startswith version. For example, "foo bar"* should become foo bar* for the startswith version, not foo bar*.
     $values['emstartswith'] = str_replace('*', '', $values['exactmatcher']) . '*';
 
-    //print_r("Sematic Structure : " . json_encode($values, JSON_UNESCAPED_UNICODE));
+    // print_r("Sematic Structure : " . json_encode($values, JSON_UNESCAPED_UNICODE));
 
     return $values;
   }
@@ -1711,6 +1726,33 @@ function classifyTokens(array $tokens): array {
     return $classified;
   }
 
+  /*
+    * Collapse sequences of phrase-like tokens joined by boolean operators into compound phrases
+    * This is to handle cases where users input something like "foo bar" AND "baz qux" OR "quux corge", which should be treated as a single compound phrase for the onephrase version, rather than separate phrases joined by operators. 
+    * The logic is:
+    * - Iterate through the classified tokens
+    * - When we encounter a phrase-like token (phrase, phrase_slop, term, term_wildcard, term_fuzzy), we look ahead to see if it is followed by a boolean operator and another phrase-like token
+    * - If we find a sequence of phrase-like tokens joined by boolean operators, we collapse them into a single 'compound_phrase' token that contains the sequence of tokens
+    * - This allows us to preserve the original structure of the query for the onephrase version, while still being able to build the AND and OR versions by filtering out the operators later
+    * Example:
+    * Input: [
+    *   ['type' => 'phrase', 'value' => ['text' => 'foo bar', 'slop' => null]],
+    *   ['type' => 'operator', 'value' => 'AND'],
+    *   ['type' => 'phrase', 'value' => ['text' => 'baz qux', 'slop' => null]],
+    *   ['type' => 'operator', 'value' => 'OR'],
+    *   ['type' => 'phrase', 'value' => ['text' => 'quux corge', 'slop' => null]],
+    * ]
+    * Output: [
+    *   ['type' => 'compound_phrase', 'value' => ['tokens' => [
+    *     ['type' => 'phrase', 'value' => ['text' => 'foo bar', 'slop' => null]],
+    *     ['type' => 'operator', 'value' => 'AND'],
+    *     ['type' => 'phrase', 'value' => ['text' => 'baz qux', 'slop' => null]],
+    *     ['type' => 'operator', 'value' => 'OR'],
+    *     ['type' => 'phrase', 'value' => ['text' => 'quux corge', 'slop' => null]],
+    *   ]]],
+    * ]
+    * This way, the onephrase version can treat the whole sequence as a single unit, while the AND and OR versions can still apply the boolean logic by filtering out the operators.
+  */
   private function collapseCompoundPhrases(array $tokens): array
   {
     $collapsed = [];
@@ -1754,7 +1796,7 @@ private function collapseCompoundPhrases(array $tokens): array
 
   private function isPhraseLikeToken(array $token): bool
   {
-    return in_array($token['type'], ['phrase', 'phrase_slop'], true);
+    return in_array($token['type'], ['phrase', 'phrase_slop', 'term', 'term_wildcard', 'term_fuzzy'], true);
   }
 
   private function isBooleanJoinOperator(array $token): bool

diff --git a/test/SolrQueryTest/ClassifySyntaxQueryTest.php b/test/SolrQueryTest/ClassifySyntaxQueryTest.php
@@ -185,6 +185,84 @@ public function testClassifyDoesNotSplitBooleanPhrase()
 
     }
 
+    /**
+     * Ensure inline boolean connectors remain visible and combine correctly with phrases.
+     * @covers Solr::tokenizeInput
+     * @covers Solr::classifyTokens
+     * @covers Solr::collapseCompoundPhrases
+     * @covers Solr::buildEscapedParts
+     */
+    public function testTokenizerANDCollapseCompoundPhrases()
+    {
+        // Walks one query through the four parsing stages in order and pins the
+        // exact output of each stage. assertSame stops at the first mismatch, so a
+        // failure points at the earliest stage whose output changed.
+        $query = 'charles dickens OR "weekly"';
+
+        // Stage 1 - tokenizeInput: split the input into tokens, including operators and phrases.
+        $rawTokens = $this->solr->tokenizeInput($query);
+        $this->assertSame(
+            ['charles', 'dickens', 'OR', '"weekly"'],
+            $rawTokens
+        );
+
+        // Stage 2 - classifyTokens: label each token as a term, an operator, or a phrase.
+        // The literal token list is passed in so this stage is checked independently of stage 1.
+        $classified = $this->solr->classifyTokens(['charles', 'dickens', 'OR', '"weekly"']);
+        $this->assertSame(
+            [
+                ['type' => 'term', 'value' => 'charles'],
+                ['type' => 'term', 'value' => 'dickens'],
+                ['type' => 'operator', 'value' => 'OR'],
+                ['type' => 'phrase', 'value' => ['text' => 'weekly', 'slop' => null]],
+            ],
+            $classified
+        );
+
+        // Stage 3 - collapseCompoundPhrases: the term/operator/phrase run is folded into one
+        // compound_phrase token, while the leading term "charles" stays on its own.
+        $collapsed = $this->invokecollapseCompoundPhrases($this->solr, $classified);
+        $this->assertSame(
+            [
+                ['type' => 'term', 'value' => 'charles'],
+                [
+                    'type' => 'compound_phrase',
+                    'value' => [
+                        'tokens' => [
+                            ['type' => 'term', 'value' => 'dickens'],
+                            ['type' => 'operator', 'value' => 'OR'],
+                            ['type' => 'phrase', 'value' => ['text' => 'weekly', 'slop' => null]],
+                        ],
+                    ],
+                ],
+            ],
+            $collapsed
+        );
+
+        // Stage 4 - buildEscapedParts: render each top-level token as one escaped query fragment.
+        $escapedParts = $this->invokebuildEscapedParts($this->solr, $collapsed);
+        $this->assertSame(
+            ['charles', 'dickens OR "weekly"'],
+            $escapedParts
+        );
+    }
+
+    private function invokecollapseCompoundPhrases($solr, $tokens): array
+    {
+        // Solr::collapseCompoundPhrases() is private, so the test calls it through reflection.
+        return (new ReflectionClass($solr))
+            ->getMethod('collapseCompoundPhrases')
+            ->invoke($solr, $tokens);
+    }
+
+    private function invokebuildEscapedParts($solr, $tokens): array
+    {
+        // Solr::buildEscapedParts() is private, so the test calls it through reflection.
+        return (new ReflectionClass($solr))
+            ->getMethod('buildEscapedParts')
+            ->invoke($solr, $tokens);
+    }
+
 
 }
 ?>