DRYD-1315: Simplify keyword search preprocessing. (#387)

author Ray Lee <ray.lee@lyrasis.org>

Fri, 19 Jan 2024 21:04:55 +0000 (16:04 -0500)

committer GitHub <noreply@github.com>

Fri, 19 Jan 2024 21:04:55 +0000 (16:04 -0500)
author Ray Lee <ray.lee@lyrasis.org>
Fri, 19 Jan 2024 21:04:55 +0000 (16:04 -0500)
committer GitHub <noreply@github.com>
Fri, 19 Jan 2024 21:04:55 +0000 (16:04 -0500)
diff --git a/services/common/src/main/java/org/collectionspace/services/common/query/nuxeo/QueryManagerNuxeoImpl.java b/services/common/src/main/java/org/collectionspace/services/common/query/nuxeo/QueryManagerNuxeoImpl.java

index 78c2b72eb625dca74c62d961f3e995e364140809..0058c73b1c3b8f772ec598ab5def8f8414ac2425 100644 (file)
--- a/services/common/src/main/java/org/collectionspace/services/common/query/nuxeo/QueryManagerNuxeoImpl.java
+++ b/services/common/src/main/java/org/collectionspace/services/common/query/nuxeo/QueryManagerNuxeoImpl.java
@@ -53,23 +53,14 @@ public class QueryManagerNuxeoImpl implements IQueryManager {
         private final Logger logger = LoggerFactory
                         .getLogger(QueryManagerNuxeoImpl.class);
  
-       // Consider that letters, letter-markers, numbers, '_' and apostrophe are
-       // words
-       private static Pattern nonWordChars = Pattern
-                       .compile("[^\\p{L}\\p{M}\\p{N}_']");
-       private static Pattern kwdTokenizer = Pattern.compile("(?:(['\"])(.*?)(?<!\\\\)(?>\\\\\\\\)*\\1|([^ ]+))");
-       private static Pattern unescapedDblQuotes = Pattern.compile("(?<!\\\\)\"");
+       private static Pattern kwdTokenizer = Pattern.compile("(\".*?\")|\\S+");
         private static Pattern unescapedSingleQuote = Pattern.compile("(?<!\\\\)'");
-       //private static Pattern kwdSearchProblemChars = Pattern.compile("[\\:\\(\\)\\*\\%]");
-       // HACK to work around Nuxeo regression that tokenizes on '.'.
-       private static Pattern kwdSearchProblemChars = Pattern.compile("[\\:\\(\\)\\*\\%\\.]");
-       private static Pattern kwdSearchHyphen = Pattern.compile(" - ");
+       private static Pattern kwdSearchProblemChars = Pattern.compile("[^\\*\\d\\p{IsAlphabetic}\\\"]");
         private static Pattern advSearchSqlWildcard = Pattern.compile(".*?[I]*LIKE\\s*\\\"\\%\\\".*?");
         // Base Nuxeo document type for all CollectionSpace documents/resources
         public static String COLLECTIONSPACE_DOCUMENT_TYPE = "CollectionSpaceDocument";
         public static final String NUXEO_DOCUMENT_TYPE = "Document";
  
-
         private static String getLikeForm(String dataSourceName, String repositoryName, String cspaceInstanceId) {
                 if (SEARCH_LIKE_FORM == null) {
                         try {
@@ -133,92 +124,63 @@ public class QueryManagerNuxeoImpl implements IQueryManager {
          * @see org.collectionspace.services.common.query.IQueryManager#
          * createWhereClauseFromKeywords(java.lang.String)
          */
-       // TODO handle keywords containing escaped punctuation chars, then we need
-       // to qualify the
-       // search by matching on the fulltext.simpletext field.
-       // TODO handle keywords containing unescaped double quotes by matching the
-       // phrase
-       // against the fulltext.simpletext field.
-       // Both these require using JDBC, since we cannot get to the fulltext table
-       // in NXQL
         @Override
         public String createWhereClauseFromKeywords(String keywords) {
-               String result = null;
                 StringBuffer fullTextWhereClause = new StringBuffer();
-               // Split on unescaped double quotes to handle phrases
-               Matcher regexMatcher = kwdTokenizer.matcher(keywords.trim());
+
+               String cleanKeywords = kwdSearchProblemChars.matcher(keywords).replaceAll(" ").trim();
+               Matcher regexMatcher = kwdTokenizer.matcher(cleanKeywords);
+
                 boolean addNOT = false;
                 boolean newWordSet = true;
+
                 while (regexMatcher.find()) {
                         String phrase = regexMatcher.group();
-                       // Not needed - already trimmed by split:
-                       // String trimmed = phrase.trim();
-                       // Ignore empty strings from match, or goofy input
-                       if (phrase.isEmpty())
+
+                       if (phrase.isEmpty()) {
+                               // Ignore empty strings from match, or goofy input
                                 continue;
+                       }
+
                         // Note we let OR through as is
-                       if("AND".equalsIgnoreCase(phrase)) {
+                       if ("AND".equalsIgnoreCase(phrase)) {
                                 continue;       // AND is default
-                       } else if("NOT".equalsIgnoreCase(phrase)) {
-                               addNOT = true;
-                               continue;
                         }
-                       // Next comment block of questionable value...
-
-                       // ignore the special chars except single quote here - can't hurt
-                       // TODO this should become a special function that strips things the
-                       // fulltext will ignore, including non-word chars and too-short
-                       // words,
-                       // and escaping single quotes. Can return a boolean for anything
-                       // stripped,
-                       // which triggers the back-up search. We can think about whether
-                       // stripping
-                       // short words not in a quoted phrase should trigger the backup.
-                       String escapedAndTrimmed = unescapedSingleQuote.matcher(phrase).replaceAll("\\\\'");
-                       // If there are non-word chars in the phrase, we need to match the
-                       // phrase exactly against the fulltext table for this object
-                       // if(nonWordChars.matcher(trimmed).matches()) {
-                       // }
-                       // Replace problem chars with spaces. Patches CSPACE-4147,
-                       // CSPACE-4106
-                       escapedAndTrimmed = kwdSearchProblemChars.matcher(escapedAndTrimmed).replaceAll(" ").trim();
-                       escapedAndTrimmed = kwdSearchHyphen.matcher(escapedAndTrimmed).replaceAll(" ").trim();
-                       if(escapedAndTrimmed.isEmpty()) {
-                               if (logger.isDebugEnabled() == true) {
-                                       logger.debug("Phrase reduced to empty after replacements: " + phrase);
-                               }
+
+                       if ("NOT".equalsIgnoreCase(phrase)) {
+                               addNOT = true;
                                 continue;
                         }
  
-                       if (fullTextWhereClause.length()==0) {
+                       if (fullTextWhereClause.length() == 0) {
                                 fullTextWhereClause.append(SEARCH_GROUP_OPEN);
                         }
+
                         if (newWordSet) {
                                 fullTextWhereClause.append(ECM_FULLTEXT_LIKE + "'");
                                 newWordSet = false;
                         } else {
                                 fullTextWhereClause.append(SEARCH_TERM_SEPARATOR);
                         }
-                       if(addNOT) {
+
+                       if (addNOT) {
                                 fullTextWhereClause.append("-");        // Negate the next term
                                 addNOT = false;
                         }
-                       fullTextWhereClause.append(escapedAndTrimmed);
  
-                       if (logger.isTraceEnabled() == true) {
-                               logger.trace("Current built whereClause is: "
-                                               + fullTextWhereClause.toString());
-                       }
+                       fullTextWhereClause.append(phrase);
+
+                       logger.trace("Current built whereClause is: " + fullTextWhereClause.toString());
                 }
-               if (fullTextWhereClause.length()==0) {
-                       if (logger.isDebugEnabled() == true) {
-                               logger.debug("No usable keywords specified in string:[" + keywords + "]");
-                       }
+
+               if (fullTextWhereClause.length() == 0) {
+                       logger.debug("No usable keywords specified in string: [" + keywords + "]");
                 } else {
                         fullTextWhereClause.append("'" + SEARCH_GROUP_CLOSE);
                 }
  
-               result = fullTextWhereClause.toString();
+               String result = fullTextWhereClause.toString();
+
                 if (logger.isDebugEnabled()) {
                         logger.debug("Final built WHERE clause is: " + result);
                 }
author	Ray Lee <ray.lee@lyrasis.org>
	Fri, 19 Jan 2024 21:04:55 +0000 (16:04 -0500)
committer	GitHub <noreply@github.com>
	Fri, 19 Jan 2024 21:04:55 +0000 (16:04 -0500)