From: Ray Lee <rhlee@berkeley.edu>
Date: Thu, 28 Jun 2018 02:09:42 +0000 (-0700)
Subject: UCJEPS-712: Upgrade gbif parser to 3.1.
X-Git-Url: https://git.aero2k.de/?a=commitdiff_plain;h=8287017ef9f6608cc614cdc280e97c74ce8d1370;p=tmp%2Fjakarta-migration.git

UCJEPS-712: Upgrade gbif parser to 3.1.
---

diff --git a/services/common-api/pom.xml b/services/common-api/pom.xml
index 8eb0041c8..3cfe4b815 100644
--- a/services/common-api/pom.xml
+++ b/services/common-api/pom.xml
@@ -16,16 +16,16 @@
             <artifactId>commons-lang3</artifactId>
             <version>3.1</version>
         </dependency>
-    
+
         <dependency>
             <groupId>org.gbif</groupId>
             <artifactId>name-parser</artifactId>
-            <version>2.0</version>
+            <version>3.1</version>
         </dependency>
         <dependency>
             <groupId>org.gbif</groupId>
-            <artifactId>gbif-api</artifactId>
-            <version>0.1</version>
+            <artifactId>name-parser-api</artifactId>
+            <version>3.1</version>
         </dependency>
         <dependency>
             <groupId>org.slf4j</groupId>
@@ -44,7 +44,7 @@
             <artifactId>testng</artifactId>
             <scope>provided</scope>
         </dependency>
-        
+
         <!-- Required for XPath processing using dom4j in XmlTools class -->
         <dependency>
             <groupId>jaxen</groupId>
diff --git a/services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java b/services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java
index 215ec6f70..9c901c584 100644
--- a/services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java
+++ b/services/common-api/src/main/java/org/collectionspace/services/common/api/TaxonFormatter.java
@@ -5,9 +5,9 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.commons.lang3.StringUtils;
-import org.gbif.api.model.checklistbank.ParsedName;
-import org.gbif.nameparser.NameParser;
-import org.gbif.nameparser.UnparsableException;
+import org.gbif.nameparser.api.ParsedName;
+import org.gbif.nameparser.NameParserGBIF;
+import org.gbif.nameparser.api.UnparsableNameException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -21,58 +21,58 @@ public class TaxonFormatter {
     private static final Pattern SUBSPECIES_WITH_QUALIFIER_PATTERN = Pattern.compile("(\\s|^)(subsp\\.\\s+)" + SUBSPECIES_QUALIFIER_MARKER_REGEXP + "(\\s)(.*?)(\\s|$)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
     private static final Pattern PARENTHESIZED_SUBSPECIES_WITH_QUALIFIER_PATTERN = Pattern.compile("(\\s|^)(subsp\\.\\s+)\\(" + SUBSPECIES_QUALIFIER_MARKER_REGEXP + "(.*?)\\)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
     private static final Pattern FAMILY_NAME_PATTERN = Pattern.compile("^[A-Z]+$");
-    
-    private NameParser nameParser;
-    
+
+    private NameParserGBIF nameParser;
+
     public TaxonFormatter() {
-    	this.nameParser = new NameParser();
+    	this.nameParser = new NameParserGBIF();
     }
-    
+
 	public String format(String name) {
 		if (StringUtils.isBlank(name)) {
 			return name;
 		}
-		
+
 		if (FAMILY_NAME_PATTERN.matcher(name).matches()) {
-			// Per Barbara Keller, family names are never italicized. 
+			// Per Barbara Keller, family names are never italicized.
 			return name;
 		}
 
 		Matcher hybridMatcher = HYBRID_FORMULA_PATTERN.matcher(name);
-		
+
 		if (hybridMatcher.matches()) {
 			String parentName1 = hybridMatcher.group(1);
 			String separator = hybridMatcher.group(2);
 			String parentName2 = hybridMatcher.group(3);
-			
+
 			logger.info("hybrid formula: parentName1=" + parentName1 + " parentName2=" + parentName2);
-			
+
 			return (format(parentName1) + separator + format(parentName2));
 		}
-		
+
 		String normalizedName = name;
 
 		if (BROKEN_HYBRID_FORMULA_PATTERN.matcher(normalizedName).find()) {
 			logger.info("broken hybrid: name=" + name + " normalizedName=" + normalizedName);
-			
-			normalizedName = BROKEN_HYBRID_FORMULA_PATTERN.matcher(normalizedName).replaceAll("");			
+
+			normalizedName = BROKEN_HYBRID_FORMULA_PATTERN.matcher(normalizedName).replaceAll("");
 			logger.info("normalized to:" + normalizedName);
 		}
-		
+
 		if (PARENTHESIZED_SUBSPECIES_WITH_QUALIFIER_PATTERN.matcher(normalizedName).find()) {
 			logger.info("parenthesized qualified subspecies: name=" + name + " normalizedName=" + normalizedName);
-			
+
 			normalizedName = PARENTHESIZED_SUBSPECIES_WITH_QUALIFIER_PATTERN.matcher(normalizedName).replaceFirst("$1$2$3$4");
 			logger.info("normalized to:" + normalizedName);
 		}
 
 		Matcher subspeciesWithQualifierMatcher = SUBSPECIES_WITH_QUALIFIER_PATTERN.matcher(normalizedName);
-		
+
 		if (subspeciesWithQualifierMatcher.find()) {
 			logger.info("qualified subspecies: name=" + name + " normalizedName=" + normalizedName);
-			
+
 			MatchResult matchResult = subspeciesWithQualifierMatcher.toMatchResult();
-			
+
 			// Remove the qualifier (e.g. section, ser., sser.). In some data from SAGE, the latin name
 			// following the qualifier is capitalized, which the GBIF parser won't handle, so lowercase it.
 			String replacement = matchResult.group(1) + matchResult.group(2) + matchResult.group(5).toLowerCase() + matchResult.group(6);
@@ -83,71 +83,71 @@ public class TaxonFormatter {
 		if (STARTS_WITH_INFRASPECIFIC_RANK_PATTERN.matcher(normalizedName).find()) {
 			/*
 			 * There are some non-standard taxon names in SAGE data, where there is an infraspecific rank/epithet, but no genus/species, e.g.
-			 *     subsp. occidentalis (J.T. Howell) C.B. Wolf 
-			 * 
+			 *     subsp. occidentalis (J.T. Howell) C.B. Wolf
+			 *
 			 * Since the GBIF parser can't handle this, we'll temporarily prepend an arbitrary genus and species for parsing purposes.
 			 */
 			logger.info("name starts with infraspecific rank: name=" + name + " normalizedName=" + normalizedName);
-			
+
 			normalizedName = "Tempgenus tempspecies " + normalizedName;
 			logger.info("normalized to:" + normalizedName);
 		}
-		
+
 		ParsedName parsedName = null;
 
 		try {
 			parsedName = nameParser.parse(normalizedName);
 		}
-		catch (UnparsableException e) {
+		catch (UnparsableNameException e) {
 			/*
 			 *  Some non-standard taxon names in SAGE data have a species, but no genus. Try to account for these by
 			 *  temporarily prepending an arbitrary genus.
 			 */
-			
+
 			logger.info("Unparsable name, trying with a temp genus: name=" + name + " normalizedName=" + normalizedName);
-			
+
 			normalizedName = "Tempgenus " + normalizedName;
-			
+
 			try {
 				parsedName = nameParser.parse(normalizedName);
 			}
-			catch (UnparsableException ex) {		
+			catch (UnparsableNameException ex) {
 				logger.error("error parsing name: name=" + name + " normalizedName=" + normalizedName + " message=" + e.getMessage());
 			}
 		}
 
 		if (parsedName != null) {
-			String genusOrAbove = parsedName.getGenusOrAbove();
+			String genusOrAbove = parsedName.getGenus();
 			String specificEpithet = parsedName.getSpecificEpithet();
-			String infraSpecificEpithet = parsedName.getInfraSpecificEpithet();
-			
+			String infraSpecificEpithet = parsedName.getInfraspecificEpithet();
+
 			logger.debug("parsed name: genusOrAbove=" + genusOrAbove + " specificEpithet=" + specificEpithet + " infraSpecificEpithet=" + infraSpecificEpithet);
-			
+
 			if (StringUtils.isNotBlank(genusOrAbove)) {
 				name = italicize(name, genusOrAbove);
 			}
-			
+
 			if (StringUtils.isNotBlank(specificEpithet)) {
 				name = italicize(name, specificEpithet);
 			}
-			
+
 			if (StringUtils.isNotBlank(infraSpecificEpithet)) {
 				name = italicize(name, infraSpecificEpithet);
 			}
-			
+
 			name = compressTags(name);
 		}
 
 		return name;
 	}
-	
+
 	private String italicize(String string, String substring) {
 		return Pattern.compile("(\\s|\\(|^)(" + Pattern.quote(substring) + ")(\\s|\\)|$)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE).matcher(string).replaceAll("$1<i>$2</i>$3");
 	}
-	
+
 	private String compressTags(String html) {
 		html = ADJACENT_ITALIC_TAG_PATTERN.matcher(html).replaceAll("$1");
-		
+
 		return html;
 	}
 }