git.aero2k.de Git - tmp/jakarta-migration.git/commitdiff
UCJEPS-712: Upgrade gbif parser to 3.1.
author Ray Lee <rhlee@berkeley.edu>
Thu, 28 Jun 2018 02:09:42 +0000 (19:09 -0700)
committer Ray Lee <rhlee@berkeley.edu>
Thu, 28 Jun 2018 04:17:13 +0000 (21:17 -0700)
services/common-api/pom.xml
services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java

index 8eb0041c8d2a9244f494b8de8f1002b8279812f4..3cfe4b815ee7003a0b05611af2c02b9e35784eb5 100644 (file)
             <artifactId>commons-lang3</artifactId>
             <version>3.1</version>
         </dependency>
-    
+
         <dependency>
             <groupId>org.gbif</groupId>
             <artifactId>name-parser</artifactId>
-            <version>2.0</version>
+            <version>3.1</version>
         </dependency>
         <dependency>
             <groupId>org.gbif</groupId>
-            <artifactId>gbif-api</artifactId>
-            <version>0.1</version>
+            <artifactId>name-parser-api</artifactId>
+            <version>3.1</version>
         </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>
@@ -44,7 +44,7 @@
             <artifactId>testng</artifactId>
             <scope>provided</scope>
         </dependency>
-        
+
         <!-- Required for XPath processing using dom4j in XmlTools class -->
         <dependency>
             <groupId>jaxen</groupId>
index 215ec6f70d43dea53dc9d8868ff4db8a092795c4..9c901c5847a620ee935c7870c80384f864727b4b 100644 (file)
@@ -5,9 +5,9 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.commons.lang3.StringUtils;
-import org.gbif.api.model.checklistbank.ParsedName;
-import org.gbif.nameparser.NameParser;
-import org.gbif.nameparser.UnparsableException;
+import org.gbif.nameparser.api.ParsedName;
+import org.gbif.nameparser.NameParserGBIF;
+import org.gbif.nameparser.api.UnparsableNameException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -21,58 +21,58 @@ public class TaxonFormatter {
     private static final Pattern SUBSPECIES_WITH_QUALIFIER_PATTERN = Pattern.compile("(\\s|^)(subsp\\.\\s+)" + SUBSPECIES_QUALIFIER_MARKER_REGEXP + "(\\s)(.*?)(\\s|$)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
     private static final Pattern PARENTHESIZED_SUBSPECIES_WITH_QUALIFIER_PATTERN = Pattern.compile("(\\s|^)(subsp\\.\\s+)\\(" + SUBSPECIES_QUALIFIER_MARKER_REGEXP + "(.*?)\\)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
     private static final Pattern FAMILY_NAME_PATTERN = Pattern.compile("^[A-Z]+$");
-    
-    private NameParser nameParser;
-    
+
+    private NameParserGBIF nameParser;
+
     public TaxonFormatter() {
-       this.nameParser = new NameParser();
+       this.nameParser = new NameParserGBIF();
     }
-    
+
        public String format(String name) {
                if (StringUtils.isBlank(name)) {
                        return name;
                }
-               
+
                if (FAMILY_NAME_PATTERN.matcher(name).matches()) {
-                       // Per Barbara Keller, family names are never italicized. 
+                       // Per Barbara Keller, family names are never italicized.
                        return name;
                }
 
                Matcher hybridMatcher = HYBRID_FORMULA_PATTERN.matcher(name);
-               
+
                if (hybridMatcher.matches()) {
                        String parentName1 = hybridMatcher.group(1);
                        String separator = hybridMatcher.group(2);
                        String parentName2 = hybridMatcher.group(3);
-                       
+
                        logger.info("hybrid formula: parentName1=" + parentName1 + " parentName2=" + parentName2);
-                       
+
                        return (format(parentName1) + separator + format(parentName2));
                }
-               
+
                String normalizedName = name;
 
                if (BROKEN_HYBRID_FORMULA_PATTERN.matcher(normalizedName).find()) {
                        logger.info("broken hybrid: name=" + name + " normalizedName=" + normalizedName);
-                       
-                       normalizedName = BROKEN_HYBRID_FORMULA_PATTERN.matcher(normalizedName).replaceAll("");                  
+
+                       normalizedName = BROKEN_HYBRID_FORMULA_PATTERN.matcher(normalizedName).replaceAll("");
                        logger.info("normalized to:" + normalizedName);
                }
-               
+
                if (PARENTHESIZED_SUBSPECIES_WITH_QUALIFIER_PATTERN.matcher(normalizedName).find()) {
                        logger.info("parenthesized qualified subspecies: name=" + name + " normalizedName=" + normalizedName);
-                       
+
                        normalizedName = PARENTHESIZED_SUBSPECIES_WITH_QUALIFIER_PATTERN.matcher(normalizedName).replaceFirst("$1$2$3$4");
                        logger.info("normalized to:" + normalizedName);
                }
 
                Matcher subspeciesWithQualifierMatcher = SUBSPECIES_WITH_QUALIFIER_PATTERN.matcher(normalizedName);
-               
+
                if (subspeciesWithQualifierMatcher.find()) {
                        logger.info("qualified subspecies: name=" + name + " normalizedName=" + normalizedName);
-                       
+
                        MatchResult matchResult = subspeciesWithQualifierMatcher.toMatchResult();
-                       
+
                        // Remove the qualifier (e.g. section, ser., sser.). In some data from SAGE, the latin name
                        // following the qualifier is capitalized, which the GBIF parser won't handle, so lowercase it.
                        String replacement = matchResult.group(1) + matchResult.group(2) + matchResult.group(5).toLowerCase() + matchResult.group(6);
@@ -83,71 +83,71 @@ public class TaxonFormatter {
                if (STARTS_WITH_INFRASPECIFIC_RANK_PATTERN.matcher(normalizedName).find()) {
                        /*
                         * There are some non-standard taxon names in SAGE data, where there is an infraspecific rank/epithet, but no genus/species, e.g.
-                        *     subsp. occidentalis (J.T. Howell) C.B. Wolf 
-                        * 
+                        *     subsp. occidentalis (J.T. Howell) C.B. Wolf
+                        *
                         * Since the GBIF parser can't handle this, we'll temporarily prepend an arbitrary genus and species for parsing purposes.
                         */
                        logger.info("name starts with infraspecific rank: name=" + name + " normalizedName=" + normalizedName);
-                       
+
                        normalizedName = "Tempgenus tempspecies " + normalizedName;
                        logger.info("normalized to:" + normalizedName);
                }
-               
+
                ParsedName parsedName = null;
 
                try {
                        parsedName = nameParser.parse(normalizedName);
                }
-               catch (UnparsableException e) {
+               catch (UnparsableNameException e) {
                        /*
                         *  Some non-standard taxon names in SAGE data have a species, but no genus. Try to account for these by
                         *  temporarily prepending an arbitrary genus.
                         */
-                       
+
                        logger.info("Unparsable name, trying with a temp genus: name=" + name + " normalizedName=" + normalizedName);
-                       
+
                        normalizedName = "Tempgenus " + normalizedName;
-                       
+
                        try {
                                parsedName = nameParser.parse(normalizedName);
                        }
-                       catch (UnparsableException ex) {                
+                       catch (UnparsableNameException ex) {
                                logger.error("error parsing name: name=" + name + " normalizedName=" + normalizedName + " message=" + e.getMessage());
                        }
                }
 
                if (parsedName != null) {
-                       String genusOrAbove = parsedName.getGenusOrAbove();
+                       String genusOrAbove = parsedName.getGenus();
                        String specificEpithet = parsedName.getSpecificEpithet();
-                       String infraSpecificEpithet = parsedName.getInfraSpecificEpithet();
-                       
+                       String infraSpecificEpithet = parsedName.getInfraspecificEpithet();
+
                        logger.debug("parsed name: genusOrAbove=" + genusOrAbove + " specificEpithet=" + specificEpithet + " infraSpecificEpithet=" + infraSpecificEpithet);
-                       
+
                        if (StringUtils.isNotBlank(genusOrAbove)) {
                                name = italicize(name, genusOrAbove);
                        }
-                       
+
                        if (StringUtils.isNotBlank(specificEpithet)) {
                                name = italicize(name, specificEpithet);
                        }
-                       
+
                        if (StringUtils.isNotBlank(infraSpecificEpithet)) {
                                name = italicize(name, infraSpecificEpithet);
                        }
-                       
+
                        name = compressTags(name);
                }
 
                return name;
        }
-       
+
        private String italicize(String string, String substring) {
                return Pattern.compile("(\\s|\\(|^)(" + Pattern.quote(substring) + ")(\\s|\\)|$)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE).matcher(string).replaceAll("$1<i>$2</i>$3");
        }
-       
+
        private String compressTags(String html) {
                html = ADJACENT_ITALIC_TAG_PATTERN.matcher(html).replaceAll("$1");
-               
+
                return html;
        }
 }