UCJEPS-712: Upgrade gbif parser to 3.1.

author Ray Lee <rhlee@berkeley.edu>

Thu, 28 Jun 2018 02:09:42 +0000 (19:09 -0700)

committer Ray Lee <rhlee@berkeley.edu>

Thu, 28 Jun 2018 04:17:13 +0000 (21:17 -0700)
author Ray Lee <rhlee@berkeley.edu>
Thu, 28 Jun 2018 02:09:42 +0000 (19:09 -0700)
committer Ray Lee <rhlee@berkeley.edu>
Thu, 28 Jun 2018 04:17:13 +0000 (21:17 -0700)
diff --git a/services/common-api/pom.xml b/services/common-api/pom.xml

index 8eb0041c8d2a9244f494b8de8f1002b8279812f4..3cfe4b815ee7003a0b05611af2c02b9e35784eb5 100644 (file)
--- a/services/common-api/pom.xml
+++ b/services/common-api/pom.xml
@@ -16,16 +16,16 @@
              <artifactId>commons-lang3</artifactId>
              <version>3.1</version>
          </dependency>
-    
+
          <dependency>
              <groupId>org.gbif</groupId>
              <artifactId>name-parser</artifactId>
-            <version>2.0</version>
+            <version>3.1</version>
          </dependency>
          <dependency>
              <groupId>org.gbif</groupId>
-            <artifactId>gbif-api</artifactId>
-            <version>0.1</version>
+            <artifactId>name-parser-api</artifactId>
+            <version>3.1</version>
          </dependency>
          <dependency>
              <groupId>org.slf4j</groupId>
@@ -44,7 +44,7 @@
              <artifactId>testng</artifactId>
              <scope>provided</scope>
          </dependency>
-        
+
          <!-- Required for XPath processing using dom4j in XmlTools class -->
          <dependency>
              <groupId>jaxen</groupId>
diff --git a/services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java b/services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java

index 215ec6f70d43dea53dc9d8868ff4db8a092795c4..9c901c5847a620ee935c7870c80384f864727b4b 100644 (file)
--- a/services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java
+++ b/services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java
@@ -5,9 +5,9 @@ import java.util.regex.Matcher;
  import java.util.regex.Pattern;
  
  import org.apache.commons.lang3.StringUtils;
-import org.gbif.api.model.checklistbank.ParsedName;
-import org.gbif.nameparser.NameParser;
-import org.gbif.nameparser.UnparsableException;
+import org.gbif.nameparser.api.ParsedName;
+import org.gbif.nameparser.NameParserGBIF;
+import org.gbif.nameparser.api.UnparsableNameException;
  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;
  
@@ -21,58 +21,58 @@ public class TaxonFormatter {
      private static final Pattern SUBSPECIES_WITH_QUALIFIER_PATTERN = Pattern.compile("(\\s|^)(subsp\\.\\s+)" + SUBSPECIES_QUALIFIER_MARKER_REGEXP + "(\\s)(.*?)(\\s|$)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
      private static final Pattern PARENTHESIZED_SUBSPECIES_WITH_QUALIFIER_PATTERN = Pattern.compile("(\\s|^)(subsp\\.\\s+)\\(" + SUBSPECIES_QUALIFIER_MARKER_REGEXP + "(.*?)\\)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
      private static final Pattern FAMILY_NAME_PATTERN = Pattern.compile("^[A-Z]+$");
-    
-    private NameParser nameParser;
-    
+
+    private NameParserGBIF nameParser;
+
      public TaxonFormatter() {
-       this.nameParser = new NameParser();
+       this.nameParser = new NameParserGBIF();
      }
-    
+
         public String format(String name) {
                 if (StringUtils.isBlank(name)) {
                         return name;
                 }
-               
+
                 if (FAMILY_NAME_PATTERN.matcher(name).matches()) {
-                       // Per Barbara Keller, family names are never italicized. 
+                       // Per Barbara Keller, family names are never italicized.
                         return name;
                 }
  
                 Matcher hybridMatcher = HYBRID_FORMULA_PATTERN.matcher(name);
-               
+
                 if (hybridMatcher.matches()) {
                         String parentName1 = hybridMatcher.group(1);
                         String separator = hybridMatcher.group(2);
                         String parentName2 = hybridMatcher.group(3);
-                       
+
                         logger.info("hybrid formula: parentName1=" + parentName1 + " parentName2=" + parentName2);
-                       
+
                         return (format(parentName1) + separator + format(parentName2));
                 }
-               
+
                 String normalizedName = name;
  
                 if (BROKEN_HYBRID_FORMULA_PATTERN.matcher(normalizedName).find()) {
                         logger.info("broken hybrid: name=" + name + " normalizedName=" + normalizedName);
-                       
-                       normalizedName = BROKEN_HYBRID_FORMULA_PATTERN.matcher(normalizedName).replaceAll("");                  
+
+                       normalizedName = BROKEN_HYBRID_FORMULA_PATTERN.matcher(normalizedName).replaceAll("");
                         logger.info("normalized to:" + normalizedName);
                 }
-               
+
                 if (PARENTHESIZED_SUBSPECIES_WITH_QUALIFIER_PATTERN.matcher(normalizedName).find()) {
                         logger.info("parenthesized qualified subspecies: name=" + name + " normalizedName=" + normalizedName);
-                       
+
                         normalizedName = PARENTHESIZED_SUBSPECIES_WITH_QUALIFIER_PATTERN.matcher(normalizedName).replaceFirst("$1$2$3$4");
                         logger.info("normalized to:" + normalizedName);
                 }
  
                 Matcher subspeciesWithQualifierMatcher = SUBSPECIES_WITH_QUALIFIER_PATTERN.matcher(normalizedName);
-               
+
                 if (subspeciesWithQualifierMatcher.find()) {
                         logger.info("qualified subspecies: name=" + name + " normalizedName=" + normalizedName);
-                       
+
                         MatchResult matchResult = subspeciesWithQualifierMatcher.toMatchResult();
-                       
+
                         // Remove the qualifier (e.g. section, ser., sser.). In some data from SAGE, the latin name
                         // following the qualifier is capitalized, which the GBIF parser won't handle, so lowercase it.
                         String replacement = matchResult.group(1) + matchResult.group(2) + matchResult.group(5).toLowerCase() + matchResult.group(6);
@@ -83,71 +83,71 @@ public class TaxonFormatter {
                 if (STARTS_WITH_INFRASPECIFIC_RANK_PATTERN.matcher(normalizedName).find()) {
                         /*
                          * There are some non-standard taxon names in SAGE data, where there is an infraspecific rank/epithet, but no genus/species, e.g.
-                        *     subsp. occidentalis (J.T. Howell) C.B. Wolf 
-                        * 
+                        *     subsp. occidentalis (J.T. Howell) C.B. Wolf
+                        *
                          * Since the GBIF parser can't handle this, we'll temporarily prepend an arbitrary genus and species for parsing purposes.
                          */
                         logger.info("name starts with infraspecific rank: name=" + name + " normalizedName=" + normalizedName);
-                       
+
                         normalizedName = "Tempgenus tempspecies " + normalizedName;
                         logger.info("normalized to:" + normalizedName);
                 }
-               
+
                 ParsedName parsedName = null;
  
                 try {
                         parsedName = nameParser.parse(normalizedName);
                 }
-               catch (UnparsableException e) {
+               catch (UnparsableNameException e) {
                         /*
                          *  Some non-standard taxon names in SAGE data have a species, but no genus. Try to account for these by
                          *  temporarily prepending an arbitrary genus.
                          */
-                       
+
                         logger.info("Unparsable name, trying with a temp genus: name=" + name + " normalizedName=" + normalizedName);
-                       
+
                         normalizedName = "Tempgenus " + normalizedName;
-                       
+
                         try {
                                 parsedName = nameParser.parse(normalizedName);
                         }
-                       catch (UnparsableException ex) {                
+                       catch (UnparsableNameException ex) {
                                 logger.error("error parsing name: name=" + name + " normalizedName=" + normalizedName + " message=" + e.getMessage());
                         }
                 }
  
                 if (parsedName != null) {
-                       String genusOrAbove = parsedName.getGenusOrAbove();
+                       String genusOrAbove = parsedName.getGenus();
                         String specificEpithet = parsedName.getSpecificEpithet();
-                       String infraSpecificEpithet = parsedName.getInfraSpecificEpithet();
-                       
+                       String infraSpecificEpithet = parsedName.getInfraspecificEpithet();
+
                         logger.debug("parsed name: genusOrAbove=" + genusOrAbove + " specificEpithet=" + specificEpithet + " infraSpecificEpithet=" + infraSpecificEpithet);
-                       
+
                         if (StringUtils.isNotBlank(genusOrAbove)) {
                                 name = italicize(name, genusOrAbove);
                         }
-                       
+
                         if (StringUtils.isNotBlank(specificEpithet)) {
                                 name = italicize(name, specificEpithet);
                         }
-                       
+
                         if (StringUtils.isNotBlank(infraSpecificEpithet)) {
                                 name = italicize(name, infraSpecificEpithet);
                         }
-                       
+
                         name = compressTags(name);
                 }
  
                 return name;
         }
-       
+
         private String italicize(String string, String substring) {
                 return Pattern.compile("(\\s|\\(|^)(" + Pattern.quote(substring) + ")(\\s|\\)|$)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE).matcher(string).replaceAll("$1<i>$2</i>$3");
         }
-       
+
         private String compressTags(String html) {
                 html = ADJACENT_ITALIC_TAG_PATTERN.matcher(html).replaceAll("$1");
-               
+
                 return html;
         }
  }
author	Ray Lee <rhlee@berkeley.edu>
	Thu, 28 Jun 2018 02:09:42 +0000 (19:09 -0700)
committer	Ray Lee <rhlee@berkeley.edu>
	Thu, 28 Jun 2018 04:17:13 +0000 (21:17 -0700)
services/common-api/pom.xml		patch \| blob \| history
services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java		patch \| blob \| history