]> git.aero2k.de Git - tmp/jakarta-migration.git/commitdiff
DRYD-1315: Simplify keyword search preprocessing. (#387)
authorRay Lee <ray.lee@lyrasis.org>
Fri, 19 Jan 2024 21:04:55 +0000 (16:04 -0500)
committerGitHub <noreply@github.com>
Fri, 19 Jan 2024 21:04:55 +0000 (16:04 -0500)
services/common/src/main/java/org/collectionspace/services/common/query/nuxeo/QueryManagerNuxeoImpl.java

index 78c2b72eb625dca74c62d961f3e995e364140809..0058c73b1c3b8f772ec598ab5def8f8414ac2425 100644 (file)
@@ -53,23 +53,14 @@ public class QueryManagerNuxeoImpl implements IQueryManager {
        private final Logger logger = LoggerFactory
                        .getLogger(QueryManagerNuxeoImpl.class);
 
-       // Consider that letters, letter-markers, numbers, '_' and apostrophe are
-       // words
-       private static Pattern nonWordChars = Pattern
-                       .compile("[^\\p{L}\\p{M}\\p{N}_']");
-       private static Pattern kwdTokenizer = Pattern.compile("(?:(['\"])(.*?)(?<!\\\\)(?>\\\\\\\\)*\\1|([^ ]+))");
-       private static Pattern unescapedDblQuotes = Pattern.compile("(?<!\\\\)\"");
+       private static Pattern kwdTokenizer = Pattern.compile("(\".*?\")|\\S+");
        private static Pattern unescapedSingleQuote = Pattern.compile("(?<!\\\\)'");
-       //private static Pattern kwdSearchProblemChars = Pattern.compile("[\\:\\(\\)\\*\\%]");
-       // HACK to work around Nuxeo regression that tokenizes on '.'.
-       private static Pattern kwdSearchProblemChars = Pattern.compile("[\\:\\(\\)\\*\\%\\.]");
-       private static Pattern kwdSearchHyphen = Pattern.compile(" - ");
+       private static Pattern kwdSearchProblemChars = Pattern.compile("[^\\*\\d\\p{IsAlphabetic}\\\"]");
        private static Pattern advSearchSqlWildcard = Pattern.compile(".*?[I]*LIKE\\s*\\\"\\%\\\".*?");
        // Base Nuxeo document type for all CollectionSpace documents/resources
        public static String COLLECTIONSPACE_DOCUMENT_TYPE = "CollectionSpaceDocument";
        public static final String NUXEO_DOCUMENT_TYPE = "Document";
 
-
        private static String getLikeForm(String dataSourceName, String repositoryName, String cspaceInstanceId) {
                if (SEARCH_LIKE_FORM == null) {
                        try {
@@ -133,92 +124,63 @@ public class QueryManagerNuxeoImpl implements IQueryManager {
         * @see org.collectionspace.services.common.query.IQueryManager#
         * createWhereClauseFromKeywords(java.lang.String)
         */
-       // TODO handle keywords containing escaped punctuation chars, then we need
-       // to qualify the
-       // search by matching on the fulltext.simpletext field.
-       // TODO handle keywords containing unescaped double quotes by matching the
-       // phrase
-       // against the fulltext.simpletext field.
-       // Both these require using JDBC, since we cannot get to the fulltext table
-       // in NXQL
        @Override
        public String createWhereClauseFromKeywords(String keywords) {
-               String result = null;
                StringBuffer fullTextWhereClause = new StringBuffer();
-               // Split on unescaped double quotes to handle phrases
-               Matcher regexMatcher = kwdTokenizer.matcher(keywords.trim());
+
+               String cleanKeywords = kwdSearchProblemChars.matcher(keywords).replaceAll(" ").trim();
+               Matcher regexMatcher = kwdTokenizer.matcher(cleanKeywords);
+
                boolean addNOT = false;
                boolean newWordSet = true;
+
                while (regexMatcher.find()) {
                        String phrase = regexMatcher.group();
-                       // Not needed - already trimmed by split:
-                       // String trimmed = phrase.trim();
-                       // Ignore empty strings from match, or goofy input
-                       if (phrase.isEmpty())
+
+                       if (phrase.isEmpty()) {
+                               // Ignore empty strings from match, or goofy input
                                continue;
+                       }
+
                        // Note we let OR through as is
-                       if("AND".equalsIgnoreCase(phrase)) {
+                       if ("AND".equalsIgnoreCase(phrase)) {
                                continue;       // AND is default
-                       } else if("NOT".equalsIgnoreCase(phrase)) {
-                               addNOT = true;
-                               continue;
                        }
-                       // Next comment block of questionable value...
-
-                       // ignore the special chars except single quote here - can't hurt
-                       // TODO this should become a special function that strips things the
-                       // fulltext will ignore, including non-word chars and too-short
-                       // words,
-                       // and escaping single quotes. Can return a boolean for anything
-                       // stripped,
-                       // which triggers the back-up search. We can think about whether
-                       // stripping
-                       // short words not in a quoted phrase should trigger the backup.
-                       String escapedAndTrimmed = unescapedSingleQuote.matcher(phrase).replaceAll("\\\\'");
-                       // If there are non-word chars in the phrase, we need to match the
-                       // phrase exactly against the fulltext table for this object
-                       // if(nonWordChars.matcher(trimmed).matches()) {
-                       // }
-                       // Replace problem chars with spaces. Patches CSPACE-4147,
-                       // CSPACE-4106
-                       escapedAndTrimmed = kwdSearchProblemChars.matcher(escapedAndTrimmed).replaceAll(" ").trim();
-                       escapedAndTrimmed = kwdSearchHyphen.matcher(escapedAndTrimmed).replaceAll(" ").trim();
-                       if(escapedAndTrimmed.isEmpty()) {
-                               if (logger.isDebugEnabled() == true) {
-                                       logger.debug("Phrase reduced to empty after replacements: " + phrase);
-                               }
+
+                       if ("NOT".equalsIgnoreCase(phrase)) {
+                               addNOT = true;
                                continue;
                        }
 
-                       if (fullTextWhereClause.length()==0) {
+                       if (fullTextWhereClause.length() == 0) {
                                fullTextWhereClause.append(SEARCH_GROUP_OPEN);
                        }
+
                        if (newWordSet) {
                                fullTextWhereClause.append(ECM_FULLTEXT_LIKE + "'");
                                newWordSet = false;
                        } else {
                                fullTextWhereClause.append(SEARCH_TERM_SEPARATOR);
                        }
-                       if(addNOT) {
+
+                       if (addNOT) {
                                fullTextWhereClause.append("-");        // Negate the next term
                                addNOT = false;
                        }
-                       fullTextWhereClause.append(escapedAndTrimmed);
 
-                       if (logger.isTraceEnabled() == true) {
-                               logger.trace("Current built whereClause is: "
-                                               + fullTextWhereClause.toString());
-                       }
+                       fullTextWhereClause.append(phrase);
+
+                       logger.trace("Current built whereClause is: " + fullTextWhereClause.toString());
                }
-               if (fullTextWhereClause.length()==0) {
-                       if (logger.isDebugEnabled() == true) {
-                               logger.debug("No usable keywords specified in string:[" + keywords + "]");
-                       }
+
+               if (fullTextWhereClause.length() == 0) {
+                       logger.debug("No usable keywords specified in string: [" + keywords + "]");
                } else {
                        fullTextWhereClause.append("'" + SEARCH_GROUP_CLOSE);
                }
 
-               result = fullTextWhereClause.toString();
+               String result = fullTextWhereClause.toString();
+
                if (logger.isDebugEnabled()) {
                        logger.debug("Final built WHERE clause is: " + result);
                }