]> git.aero2k.de Git - tmp/jakarta-migration.git/commitdiff
HM-19:cesarvh: Improved BC date range precision, added carbon dating support.
authorRichard Millet <remillet@gmail.com>
Tue, 26 Feb 2019 20:01:59 +0000 (12:01 -0800)
committercesarvh <cesarv.h@berkeley.edu>
Tue, 19 Mar 2019 19:41:04 +0000 (12:41 -0700)
services/structureddate/structureddate/src/main/antlr4/org/collectionspace/services/structureddate/antlr/StructuredDate.g4
services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/DateUtils.java
services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/antlr/ANTLRStructuredDateEvaluator.java
services/structureddate/structureddate/src/test/java/org/collectionspace/services/structureddate/StructuredDateEvaluatorTest.java
services/structureddate/structureddate/src/test/resources/test-dates.yaml

index 5bd8e4f57fa6814d8d759d840c1ba73bf7dada95..fca4433b6daa826fd38e4f41e9fa08a9c9e819a8 100644 (file)
@@ -15,6 +15,7 @@ displayDate:           uncertainDate
 |                      certainDate
 |                      beforeOrAfterDate
 |                      unknownDate
+|                      uncalibratedDate
 ;
 
 uncertainDate:         CIRCA certainDate ;
@@ -25,6 +26,8 @@ certainDate:           hyphenatedRange
 
 beforeOrAfterDate:     ( BEFORE | AFTER ) singleInterval ;
 
+uncalibratedDate:      numYear PLUSMINUS num YEARSSTRING? BP ;
+
 hyphenatedRange:       singleInterval ( HYPHEN | DASH ) singleInterval
 |                      nthCenturyRange
 |                      monthInYearRange
@@ -134,8 +137,8 @@ unknownDate:           UNKNOWN ;
 /*
  * Lexer rules
  */
-
 WS:             [ \t\r\n]+ -> skip;
+PLUSMINUS:      '±' | '+/-' ;
 CIRCA:          ('c' | 'ca') DOT? | 'circa' ;
 SPRING:         'spring' | 'spr' ;
 SUMMER:         'summer' | 'sum' ;
@@ -158,11 +161,12 @@ MILLENNIUM:     'millennium' ;
 MONTH:          'january' | 'february' | 'march' | 'april' | 'may' | 'june' | 'july' | 'august' | 'september' | 'october' | 'november' | 'december' ;
 SHORTMONTH:     'jan' | 'feb' | 'mar' | 'apr' | 'jun' | 'jul' | 'aug' | 'sep' | 'sept' | 'oct' | 'nov' | 'dec' ;
 BC:             'bc' | 'bce' |  'b.c.' | 'b.c.e.' ;
-AD:             'ad' | 'a.d.' | 'ce' | 'c.e.';
+AD:             'ad' | 'a.d.' | 'ce' | 'c.e.' ;
+BP:             'bp' | 'b.p.' | 'b.p' ;
 NTHSTR:         [0-9]*? ([0456789] 'th' | '1st' | '2nd' | '3rd' | '11th' | '12th' | '13th') ;
 HUNDREDS:       [0-9]*? '00' '\''? 's';
 TENS:           [0-9]*? '0' '\''? 's';
-NUMBER:         [0-9]+ ;
+NUMBER:         [0-9]+ ( ',' [0-9]+ )* ; /* digit groups separated by single commas; never starts or ends with a comma */
 COMMA:          ',' ;
 HYPHEN:         '-' ;
 DASH:           [—–] ; /* EM DASH, EN DASH */
@@ -171,4 +175,5 @@ DOT:            '.' ;
 QUESTION:       '?' ;
 OTHER:          . ;
 UNKNOWN:        'unknown' ;
+YEARSSTRING:    'years' | 'year' ;
 STRING:         [a-z]+ ;
index 67c1abc0d9ea045b2f1c3ae3ccaba60645359a61..e3b6687a8c6dd106985ccdbc3e3634306395bf55 100644 (file)
@@ -1120,16 +1120,33 @@ public class DateUtils {
                        return currentDate;
                }
 
-               MutableDateTime currentDateTime = convertToDateTime(currentDate);
-               MutableDateTime endDateTime = convertToDateTime(endDate);
-               
-               int comparisonResult = currentDateTime.compareTo(endDateTime);
+               int comparisonResult = compareDates(currentDate, endDate);
                if (comparisonResult == 1 || comparisonResult == 0) {
                        return currentDate;
                }
                return null;
        }
 
+       /**
+        * Wrapper function for MutableDateTime's comparator.
+        * @param startDate The first date in the range
+        * @param endDate   The last date in the range
+        * @return          -1 if startDate is before, 0 if they are equal, 1 if startDate is after endDate
+        */
+       public static int compareDates(Date startDate, Date endDate) {
+               if (startDate.getYear() == null || endDate.getYear() == null) {
+                       throw new IllegalArgumentException("Must provide a start and end date to compare.");
+               }
+
+               MutableDateTime startDateTime = convertToDateTime(startDate);
+               MutableDateTime endDateTime = convertToDateTime(endDate);
+               
+               return startDateTime.compareTo(endDateTime);
+       }
+
+       /**
+        * Returns a Date object based on the local date.
+        */
        public static Date getCurrentDate() {
                LocalDate localDate = new LocalDate();
                Integer year = (Integer) localDate.getYear();
@@ -1190,6 +1207,22 @@ public class DateUtils {
                if (era == null) {
                        era = Date.DEFAULT_ERA;
                }
+
+               if (era == Era.BCE) {
+                       // Improved precision for BC dates
+                       int interval = 0;
+
+                       if (year % 1000 == 0) {
+                               interval = 500;
+                       } else if (year % 100 == 0) {
+                               interval = 50;
+                       } else if (year % 10 == 0) {
+                               interval = 10;
+                       } else if (year % 10 > 0 && year % 10 < 10) {
+                               interval = 5;
+                       }
+                       return interval;
+               }
                
                MutableDateTime dateTime = new MutableDateTime(chronology);
                dateTime.era().set((era == Era.BCE) ? DateTimeConstants.BC : DateTimeConstants.AD);
@@ -1200,6 +1233,8 @@ public class DateUtils {
                
                int years = Years.yearsBetween(dateTime, circaBaseDateTime).getYears();
 
+
+               // return interval;
                return ((int) Math.round(years * 0.05));
        }
        
index 5225d75d38e8904befc6da00152bd1e0b950e488..8fe852efc0fe2d8f2a1cb6dca15a9f32dfeeb28a 100644 (file)
@@ -1,5 +1,7 @@
 package org.collectionspace.services.structureddate.antlr;
 
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.Stack;
 
 import org.antlr.v4.runtime.ANTLRInputStream;
@@ -85,6 +87,7 @@ import org.collectionspace.services.structureddate.antlr.StructuredDateParser.St
 import org.collectionspace.services.structureddate.antlr.StructuredDateParser.StrMonthContext;
 import org.collectionspace.services.structureddate.antlr.StructuredDateParser.StrSeasonContext;
 import org.collectionspace.services.structureddate.antlr.StructuredDateParser.StrSeasonInYearRangeContext;
+import org.collectionspace.services.structureddate.antlr.StructuredDateParser.UncalibratedDateContext;
 import org.collectionspace.services.structureddate.antlr.StructuredDateParser.UncertainDateContext;
 import org.collectionspace.services.structureddate.antlr.StructuredDateParser.UnknownDateContext;
 import org.collectionspace.services.structureddate.antlr.StructuredDateParser.YearContext;
@@ -118,8 +121,7 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp
                result = new StructuredDateInternal();
                result.setDisplayDate(displayDate);
 
-               // Instantiate a parser from the lowercased display date, so that parsing will be
-               // case insensitive.
+               // Instantiate a parser from the lowercased display date, so that parsing will be case insensitive 
                ANTLRInputStream inputStream = new ANTLRInputStream(displayDate.toLowerCase());
                StructuredDateLexer lexer = new StructuredDateLexer(inputStream);
                CommonTokenStream tokenStream = new CommonTokenStream(lexer);
@@ -157,6 +159,27 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp
                Date latestDate = (Date) stack.pop();
                Date earliestDate = (Date) stack.pop();
 
+               // Both endpoints need a year before they can be ordered; compareDates rejects
+               // a date that has none.
+               if (earliestDate.getYear() != null && latestDate.getYear() != null) {
+                       // A positive result means the range was written latest-first, so swap the endpoints.
+                       if (DateUtils.compareDates(earliestDate, latestDate) > 0) {
+                               Date temp = earliestDate;
+                               earliestDate = latestDate;
+                               latestDate = temp;
+
+                               // Check to see if the dates were reversed AND calculated. If they were,
+                               // then this probably means the absolute earliestDate should have month and day as "1"
+                               // and the latestDate month 12, day 31.
+                               // equals() on a boxed constant is used so that an unset month or day simply
+                               // fails the check (assumes the getters return boxed values; verify against Date).
+                               boolean earliestIsYearEnd = Integer.valueOf(12).equals(earliestDate.getMonth())
+                                       && Integer.valueOf(31).equals(earliestDate.getDay());
+                               boolean latestIsYearStart = Integer.valueOf(1).equals(latestDate.getMonth())
+                                       && Integer.valueOf(1).equals(latestDate.getDay());
+
+                               if (earliestIsYearEnd && latestIsYearStart) {
+                                       earliestDate.setMonth(1);
+                                       earliestDate.setDay(1);
+                                       latestDate.setMonth(12);
+                                       latestDate.setDay(31);
+                               }
+                       }
+               }
+
                // If the earliest date and the latest date are the same, it's just a "single" date.
                // There's no need to have the latest, so set it to null.
 
@@ -217,16 +240,17 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp
                Date latestDate = (Date) stack.pop();
                Date earliestDate = (Date) stack.pop();
 
+
                int earliestInterval = DateUtils.getCircaIntervalYears(earliestDate.getYear(), earliestDate.getEra());
                int latestInterval = DateUtils.getCircaIntervalYears(latestDate.getYear(), latestDate.getEra());
 
-               // Express the circa interval as a qualifier.
-
-               // stack.push(earliestDate.withQualifier(QualifierType.MINUS, earliestInterval, QualifierUnit.YEARS));
-               // stack.push(latestDate.withQualifier(QualifierType.PLUS, latestInterval, QualifierUnit.YEARS));
+               // Express the circa interval as a qualifier.   
 
-               // OR:
+               // stack.push(earliestDate.withQualifier(QualifierType.MINUS, earliestInterval, QualifierUnit.YEARS));  
+               // stack.push(latestDate.withQualifier(QualifierType.PLUS, latestInterval, QualifierUnit.YEARS));       
 
+               // OR:  
+                
                // Express the circa interval as an offset calculated into the year.
 
                DateUtils.subtractYears(earliestDate, earliestInterval);
@@ -930,7 +954,7 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp
                // Convert the string to a number,
                // and push on the stack.
 
-               Integer year = new Integer(ctx.NUMBER().getText());
+               Integer year = new Integer(ctx.getText().replaceAll(",", ""));
 
                if (year == 0) {
                        throw new StructuredDateFormatException("unexpected year '" + ctx.NUMBER().getText() + "'");
@@ -1177,7 +1201,7 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp
                // Convert the numeric string to an Integer,
                // and push on the stack.
 
-               Integer num = new Integer(ctx.NUMBER().getText());
+               Integer num = new Integer(ctx.getText().replaceAll(",", ""));
 
                stack.push(num);
        }
@@ -1191,6 +1215,29 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp
                stack.push(new Date());
        }
 
+       public void exitUncalibratedDate(UncalibratedDateContext ctx) {
+               if (ctx.exception != null) return;
+
+               Integer adjustmentDate = (Integer) stack.pop();
+               Integer mainYear = (Integer) stack.pop();
+
+               Integer upperBound = mainYear + adjustmentDate;
+               Integer lowerBound = mainYear - adjustmentDate;
+
+               Integer currentYear = DateUtils.getCurrentDate().getYear();
+
+               Integer earliestYear = currentYear - upperBound;
+               Integer latestYear = currentYear - lowerBound ;
+
+               // If negative, then BC, else AD
+               Era earliestEra = earliestYear < 0 ? Era.BCE : Era.CE;
+               Era latestEra = latestYear < 0 ? Era.BCE : Era.CE;
+
+               stack.push(new Date(Math.abs(earliestYear), 1, 1, earliestEra)); // Earliest Early Date
+               stack.push(new Date(Math.abs(latestYear), 12, DateUtils.getDaysInMonth(12, Math.abs(latestYear), latestEra), latestEra)); // Latest Late Date
+
+       }
+
        protected String getErrorMessage(RecognitionException re) {
                String message = "";
 
index 039252b8be19f29eabf30f984f6b72ca01f06cd0..3f1befd7b152a7373653f4285605b3c707b1eb29 100644 (file)
@@ -7,6 +7,7 @@ import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Stack;
 
 import org.apache.commons.beanutils.PropertyUtils;
 import org.slf4j.Logger;
@@ -57,14 +58,19 @@ public class StructuredDateEvaluatorTest {
                if (structuredDateFields != null && structuredDateFields.containsKey("latestDate")) {
                        Object latestDate = structuredDateFields.get("latestDate");
                        if (latestDate instanceof String) {
+                               Date currentDate = DateUtils.getCurrentDate();
+                               ArrayList latestDateItems = new ArrayList<>();
                                if (latestDate.equals("current date")) {
-                                       ArrayList items = new ArrayList<>();
-                                       Date currentDate = DateUtils.getCurrentDate();
-                                       items.add(currentDate.getYear());
-                                       items.add(currentDate.getMonth());
-                                       items.add(currentDate.getDay());
-                                       items.add(currentDate.getEra() == Era.BCE ? "BCE" : "CE");
-                                       structuredDateFields.put("latestDate", items);
+                                       latestDateItems.add(currentDate.getYear());
+                                       latestDateItems.add(currentDate.getMonth());
+                                       latestDateItems.add(currentDate.getDay());
+                                       latestDateItems.add(currentDate.getEra() == Era.BCE ? "BCE" : "CE");
+                                       structuredDateFields.put("latestDate", latestDateItems);
+                               }
+                               if (latestDate.equals("uncalibrated latest date")) {
+                                       Stack<ArrayList> results = calculateUncalibratedDate(displayDate, currentDate.getYear());
+                                       structuredDateFields.put("latestDate", results.pop());
+                                       structuredDateFields.put("earliestSingleDate", results.pop());
                                }
                        }
                }
@@ -103,6 +109,59 @@ public class StructuredDateEvaluatorTest {
                return structuredDate;
        }
 
+
+       /** 
+        * Calculates the uncalibrated date, since the yalm expected dates need to be dynamic
+        * as they will change from year to year. 
+        * @param displayDate The current test's display date
+        * @param currentYear The current year
+        * 
+        * @return a stack consisting of two ArrayLists, each containing the expected dates
+       */
+       public Stack<ArrayList> calculateUncalibratedDate(String displayDate, Integer currentYear) {
+               Stack<ArrayList> stack = new Stack<ArrayList>();
+               ArrayList latestDate = new ArrayList<>();
+               ArrayList earliestDate = new ArrayList<>();
+
+
+               String reg = "±|\\+/-";
+               String[] splitDateTokens = displayDate.split(reg);
+               String[] tokensPartTwo = splitDateTokens[1].split(" ");
+
+               Integer mainYear = Integer.parseInt(splitDateTokens[0].replaceAll("\\s|,", ""));
+               Integer offset;
+
+               try {
+                       offset = Integer.parseInt(tokensPartTwo[0]);
+               } catch (Exception e) {
+                       offset = Integer.parseInt(tokensPartTwo[1].replaceAll("\\s|,", ""));
+               }
+
+               Integer earliestYear = currentYear - (mainYear + offset);
+               Integer latestYear   = currentYear - (mainYear - offset);
+               
+               String earliestEra = earliestYear < 0 ? "BCE" : "CE";
+               String latestEra = latestYear < 0 ? "BCE" : "CE";
+               
+               earliestYear = Math.abs(earliestYear);
+               latestYear = Math.abs(latestYear);
+
+               latestDate.add(latestYear);
+               latestDate.add(12);
+               latestDate.add(DateUtils.getDaysInMonth(12, latestYear, null));
+               latestDate.add(latestEra);
+
+               earliestDate.add(earliestYear);
+               earliestDate.add(1);
+               earliestDate.add(1);
+               earliestDate.add(earliestEra);
+
+               stack.push(earliestDate);
+               stack.push(latestDate);
+
+               return stack;
+       }
+
        private Date createDateFromYamlSpec(List<Object> dateFields) {
                Date date = new Date();
                Iterator<Object> fieldIterator = dateFields.iterator();
index 2b7e1925f1b3cdc584a6822ef32da822e4248173..a746dbd7558bd964e1ccc89b24ebea5ff949f6c0 100644 (file)
                                          latestDate:         [2013,  4,  5, CE]
 
   '5/3/1962-4/5/2013 BC':                # hyphenatedRange, date
-                                         earliestSingleDate: [1962,  5,  3, BCE]
-                                         latestDate:         [2013,  4,  5, BCE]
+                                         earliestSingleDate:  [2013,  4,  5, BCE]
+                                         latestDate:          [1962,  5,  3, BCE]
 
   '5/3/1962 BC-4/5/2013':                # hyphenatedRange, date
                                          earliestSingleDate: [1962,  5,  3, BCE]
   #                                      latestDate:         [  10, 12, 31, BCE, null, PLUS,  106, YEARS]
 
   'Circa 10 BC':                         # uncertainDate, year - calculating the uncertainty into the year field
-                                         earliestSingleDate: [ 115,  1,  1, BCE]
-                                         latestDate:         [  96, 12, 31, CE]
+                                         earliestSingleDate: [ 20,  1,  1, BCE]
+                                         latestDate:         [  1, 12, 31, CE]
 
   # 'Circa 10':                          # uncertainDate, year - using qualifier/value/unit fields
   #                                      earliestSingleDate: [  10,  1,  1, CE, null, MINUS, 105, YEARS]
   "13th april, 1995":                    # oneDisplayDate -  singleInterval - dayFirstDate - Day (ordinal) Month Year
                                          earliestSingleDate: [1995,  4,  13, CE]
 
-  "13th april, 1995 - 5th may 1999":    # oneDisplayDate -  hyphenatedRange - dayFirstDate - Day (ordinal) Month Year
+  "13th april, 1995 - 5th may 1999":     # oneDisplayDate -  hyphenatedRange - dayFirstDate - Day (ordinal) Month Year
                                          earliestSingleDate: [1995,  4,  13, CE]
                                          latestDate:         [1999,  5,  5, CE]
 
-  "13 april 15":                        # oneDisplayDate - ambigous day and year - should be Year month day
+  "13 april 15":                         # oneDisplayDate - ambiguous day and year - should be Year month day
                                          earliestSingleDate: [13,  4,  15, CE]
 
   "before 13 april 1995":                # beforeAfterDate - Empty earliestSingleDate - Day Month Year Format
                                          latestDate:         [2017,  6, 10, CE]
 
 
-  "after 13 april 1995":                # beforeAfterDate - Empty latestDate calculated as current date - Day Month Year Format
+  "after 13 april 1995":                 # beforeAfterDate - Empty latestDate calculated as current date - Day Month Year Format
                                          earliestSingleDate: [1995,  4, 13, CE]
                                          latestDate:         "current date"
 
-  "after april 13 1995":                # beforeAfterDate - Empty latestDate calculated as current date -  Month Day Year Format
+  "after april 13 1995":                 # beforeAfterDate - Empty latestDate calculated as current date -  Month Day Year Format
                                          earliestSingleDate: [1995,  4, 13, CE]
                                          latestDate:         "current date"
   
-  "10/2005-12/2006":                    # Month/Year - Month/Year date
+  "10/2005-12/2006":                     # Month/Year - Month/Year date
                                          earliestSingleDate: [2005,  10, 1, CE]
                                          latestDate:         [2006,  12, 31, CE]
 
-  "04/1995-04/2018":                    # Month/Year - Month/Year date
+  "04/1995-04/2018":                     # Month/Year - Month/Year date
                                          earliestSingleDate: [1995,  4, 1, CE]
                                          latestDate:         [2018,  4, 30, CE]
 
-  "unknown":                            # Unknown date: Should result in empty fields
+  "unknown":                             # Unknown date: Should result in empty fields
                                          earliestSingleDate: []
 
-  "13 april 15":                        # oneDisplayDate - ambiguous day and year, intepreted as year month day
+  "13 april 15":                         # oneDisplayDate - ambiguous day and year, interpreted as year month day
                                          earliestSingleDate: [13,  4, 15, CE]
 
-  "04/5-6/2018":                        # Month/Day - Day/Year date
+  "04/5-6/2018":                         # Month/Day - Day/Year date
                                          earliestSingleDate: [2018,  4, 5, CE]
                                          latestDate:         [2018,  4, 6, CE]
   
-  "04/03-07/09":                        # Ambigious NumDayInMonthRange - should be interpreted as Month/Day - Day/Year date
+  "04/03-07/09":                         # Ambiguous NumDayInMonthRange - should be interpreted as Month/Day - Day/Year date
                                          earliestSingleDate: [9,  4, 3, CE]
                                          latestDate:         [9,  4, 7, CE] 
 
   '04/1996-07/09':                       # Semi-ambigious NumDayInMonthRange - should be interpreted as Month/Year - Month/Year date
-                                         earliestSingleDate: [1996,  4,  1, CE]
-                                         latestDate:         [9,     7, 31, CE] 
+                                         earliestSingleDate: [9,     7, 31, CE]
+                                         latestDate:         [1996,  4,  1, CE]
+
+  "1200±50 BP":                          # Uncalibrated date with ± symbol, with CE
+                                         earliestSingleDate: "uncalibrated earliest date"
+                                         latestDate:         "uncalibrated latest date"
+
+  "3100 +/- 150 BP":                     # Uncalibrated date with +/- instead of ± symbol
+                                         earliestSingleDate: "uncalibrated earliest date"
+                                         latestDate:         "uncalibrated latest date"
+
+  "3100+/-150 BP":                       # Uncalibrated date with +/- instead of ± symbol, no spaces
+                                         earliestSingleDate: "uncalibrated earliest date"
+                                         latestDate:         "uncalibrated latest date"
+
+  "3100+/-150 years BP":                 # Uncalibrated date with 'years' in it
+                                         earliestSingleDate: "uncalibrated earliest date"
+                                         latestDate:         "uncalibrated latest date"
+
+  "3,100+/-150 years BP":                # Uncalibrated date with 'years' in it as well as with a comma
+                                         earliestSingleDate: "uncalibrated earliest date"
+                                         latestDate:         "uncalibrated latest date"
+
+  "2000±100 BP":                         # Uncalibrated date with BCE and AD mix
+                                         earliestSingleDate: "uncalibrated earliest date"
+                                         latestDate:         "uncalibrated latest date"
+
+  "5580-5460 BC":                        # Calibrated date without commas
+                                         earliestSingleDate: [5580,  1, 1, BCE]
+                                         latestDate:         [5460,  12, 31, BCE]
+
+  "5,580 - 5,460 BC":                    # Calibrated date with commas and spaces
+                                         earliestSingleDate: [5580,  1, 1, BCE]
+                                         latestDate:         [5460,  12, 31, BCE]
+
+  "5460-5580 BC":                        # Calibrated date with dates reversed
+                                         earliestSingleDate: [5580,  1, 1, BCE]
+                                         latestDate:         [5460,  12, 31, BCE]
+
+  "c. 69 BC":                            # Circa date, ± 5 years
+                                         earliestSingleDate: [74, 1, 1, BCE]
+                                         latestDate:         [64, 12, 31, BCE]
+
+  "ca. 60 BC":                           # Circa date, ± 10 years
+                                         earliestSingleDate: [70, 1, 1, BCE]
+                                         latestDate:         [50, 12, 31, BCE]
+
+  "circa 200 BC":                        # Circa date, ± 50 years
+                                         earliestSingleDate: [250, 1, 1, BCE]
+                                         latestDate:         [150, 12, 31, BCE]
+
+  "circa 1000 BC":                       # Circa date, ± 500 years
+                                         earliestSingleDate: [1500, 1, 1, BCE]
+                                         latestDate:         [500, 12, 31, BCE]
+
+  '5/13/54,962 BC-4/5/2,019':            # hyphenatedRange, date with comma'd numbers
+                                         earliestSingleDate: [54962,  5,  13, BCE]
+                                         latestDate:         [2019,  4,  5, CE]
 # -------------------------------------------------------------------------------------------------------
 # Invalid dates
 # -------------------------------------------------------------------------------------------------------