From 23732b541e6d63fbd8de9f23b72116e882b03d42 Mon Sep 17 00:00:00 2001 From: Richard Millet Date: Tue, 26 Feb 2019 12:01:59 -0800 Subject: [PATCH] HM-19:cesarvh: Improved BC date range precision, added carbon dating support. --- .../structureddate/antlr/StructuredDate.g4 | 11 ++- .../services/structureddate/DateUtils.java | 43 ++++++++- .../antlr/ANTLRStructuredDateEvaluator.java | 65 ++++++++++++-- .../StructuredDateEvaluatorTest.java | 73 +++++++++++++-- .../src/test/resources/test-dates.yaml | 88 +++++++++++++++---- 5 files changed, 241 insertions(+), 39 deletions(-) diff --git a/services/structureddate/structureddate/src/main/antlr4/org/collectionspace/services/structureddate/antlr/StructuredDate.g4 b/services/structureddate/structureddate/src/main/antlr4/org/collectionspace/services/structureddate/antlr/StructuredDate.g4 index 5bd8e4f57..fca4433b6 100644 --- a/services/structureddate/structureddate/src/main/antlr4/org/collectionspace/services/structureddate/antlr/StructuredDate.g4 +++ b/services/structureddate/structureddate/src/main/antlr4/org/collectionspace/services/structureddate/antlr/StructuredDate.g4 @@ -15,6 +15,7 @@ displayDate: uncertainDate | certainDate | beforeOrAfterDate | unknownDate +| uncalibratedDate ; uncertainDate: CIRCA certainDate ; @@ -25,6 +26,8 @@ certainDate: hyphenatedRange beforeOrAfterDate: ( BEFORE | AFTER ) singleInterval ; +uncalibratedDate: numYear PLUSMINUS num YEARSSTRING? BP ; + hyphenatedRange: singleInterval ( HYPHEN | DASH ) singleInterval | nthCenturyRange | monthInYearRange @@ -134,8 +137,8 @@ unknownDate: UNKNOWN ; /* * Lexer rules */ - WS: [ \t\r\n]+ -> skip; +PLUSMINUS: '±' | '+/-' ; CIRCA: ('c' | 'ca') DOT? 
| 'circa' ; SPRING: 'spring' | 'spr' ; SUMMER: 'summer' | 'sum' ; @@ -158,11 +161,12 @@ MILLENNIUM: 'millennium' ; MONTH: 'january' | 'february' | 'march' | 'april' | 'may' | 'june' | 'july' | 'august' | 'september' | 'october' | 'november' | 'december' ; SHORTMONTH: 'jan' | 'feb' | 'mar' | 'apr' | 'jun' | 'jul' | 'aug' | 'sep' | 'sept' | 'oct' | 'nov' | 'dec' ; BC: 'bc' | 'bce' | 'b.c.' | 'b.c.e.' ; -AD: 'ad' | 'a.d.' | 'ce' | 'c.e.'; +AD: 'ad' | 'a.d.' | 'ce' | 'c.e.' ; +BP: 'bp' | 'b.p.' | 'b.p' ; NTHSTR: [0-9]*? ([0456789] 'th' | '1st' | '2nd' | '3rd' | '11th' | '12th' | '13th') ; HUNDREDS: [0-9]*? '00' '\''? 's'; TENS: [0-9]*? '0' '\''? 's'; -NUMBER: [0-9]+ ; +NUMBER: ([0-9,]+)*[0-9] ; COMMA: ',' ; HYPHEN: '-' ; DASH: [—–] ; /* EM DASH, EN DASH */ @@ -171,4 +175,5 @@ DOT: '.' ; QUESTION: '?' ; OTHER: . ; UNKNOWN: 'unknown' ; +YEARSSTRING: 'years' | 'year' ; STRING: [a-z]+ ; diff --git a/services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/DateUtils.java b/services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/DateUtils.java index 67c1abc0d..e3b6687a8 100644 --- a/services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/DateUtils.java +++ b/services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/DateUtils.java @@ -1120,16 +1120,33 @@ public class DateUtils { return currentDate; } - MutableDateTime currentDateTime = convertToDateTime(currentDate); - MutableDateTime endDateTime = convertToDateTime(endDate); - - int comparisonResult = currentDateTime.compareTo(endDateTime); + int comparisonResult = compareDates(currentDate, endDate); if (comparisonResult == 1 || comparisonResult == 0) { return currentDate; } return null; } + /** + * Wrapper function for MutableDateTime's comparator. 
+ * @param startDate The first date in the range + * @param endDate The last date in the range + * @return -1 if startDate is before, 0 if they are equal, 1 if startDate is after endDate + */ + public static int compareDates(Date startDate, Date endDate) { + if (startDate.getYear() == null || endDate.getYear() == null) { + throw new IllegalArgumentException("Must provide a start and end date to compare."); + } + + MutableDateTime startDateTime = convertToDateTime(startDate); + MutableDateTime endDateTime = convertToDateTime(endDate); + + return startDateTime.compareTo(endDateTime); + } + + /** + * Returns a Date object based on the local date. + */ public static Date getCurrentDate() { LocalDate localDate = new LocalDate(); Integer year = (Integer) localDate.getYear(); @@ -1190,6 +1207,22 @@ public class DateUtils { if (era == null) { era = Date.DEFAULT_ERA; } + + if (era == Era.BCE) { + // Improved precision for BC dates + int interval = 0; + + if (year % 1000 == 0) { + interval = 500; + } else if (year % 100 == 0) { + interval = 50; + } else if (year % 10 == 0) { + interval = 10; + } else if (year % 10 > 0 && year % 10 < 10) { + interval = 5; + } + return interval; + } MutableDateTime dateTime = new MutableDateTime(chronology); dateTime.era().set((era == Era.BCE) ? 
DateTimeConstants.BC : DateTimeConstants.AD); @@ -1200,6 +1233,8 @@ public class DateUtils { int years = Years.yearsBetween(dateTime, circaBaseDateTime).getYears(); + + // return interval; return ((int) Math.round(years * 0.05)); } diff --git a/services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/antlr/ANTLRStructuredDateEvaluator.java b/services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/antlr/ANTLRStructuredDateEvaluator.java index 5225d75d3..8fe852efc 100644 --- a/services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/antlr/ANTLRStructuredDateEvaluator.java +++ b/services/structureddate/structureddate/src/main/java/org/collectionspace/services/structureddate/antlr/ANTLRStructuredDateEvaluator.java @@ -1,5 +1,7 @@ package org.collectionspace.services.structureddate.antlr; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.Stack; import org.antlr.v4.runtime.ANTLRInputStream; @@ -85,6 +87,7 @@ import org.collectionspace.services.structureddate.antlr.StructuredDateParser.St import org.collectionspace.services.structureddate.antlr.StructuredDateParser.StrMonthContext; import org.collectionspace.services.structureddate.antlr.StructuredDateParser.StrSeasonContext; import org.collectionspace.services.structureddate.antlr.StructuredDateParser.StrSeasonInYearRangeContext; +import org.collectionspace.services.structureddate.antlr.StructuredDateParser.UncalibratedDateContext; import org.collectionspace.services.structureddate.antlr.StructuredDateParser.UncertainDateContext; import org.collectionspace.services.structureddate.antlr.StructuredDateParser.UnknownDateContext; import org.collectionspace.services.structureddate.antlr.StructuredDateParser.YearContext; @@ -118,8 +121,7 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp result = new StructuredDateInternal(); 
result.setDisplayDate(displayDate); - // Instantiate a parser from the lowercased display date, so that parsing will be - // case insensitive. + // Instantiate a parser from the lowercased display date, so that parsing will be case insensitive ANTLRInputStream inputStream = new ANTLRInputStream(displayDate.toLowerCase()); StructuredDateLexer lexer = new StructuredDateLexer(inputStream); CommonTokenStream tokenStream = new CommonTokenStream(lexer); @@ -157,6 +159,27 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp Date latestDate = (Date) stack.pop(); Date earliestDate = (Date) stack.pop(); + if (earliestDate.getYear() != null && latestDate.getYear() != null) { // compareDates throws unless both years are set + int compareResult = DateUtils.compareDates(earliestDate, latestDate); + if (compareResult > 0) { // earliest is after latest: the range was entered in reverse + Date temp; + temp = earliestDate; + earliestDate = latestDate; + latestDate = temp; + + // Check whether the dates were reversed AND calculated. If they were, the swapped + // earliestDate still carries the end-of-year default (month 12, day 31) and the swapped + // latestDate the start-of-year default (month 1, day 1), so exchange those defaults too. + if ((earliestDate.getMonth() == 12 && earliestDate.getDay() == 31) && + (latestDate.getMonth() == 1 && latestDate.getDay() == 1)) { + earliestDate.setMonth(1); + earliestDate.setDay(1); + latestDate.setMonth(12); + latestDate.setDay(31); + } + } + } + // If the earliest date and the latest date are the same, it's just a "single" date. // There's no need to have the latest, so set it to null. @@ -217,16 +240,17 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp Date latestDate = (Date) stack.pop(); Date earliestDate = (Date) stack.pop(); + int earliestInterval = DateUtils.getCircaIntervalYears(earliestDate.getYear(), earliestDate.getEra()); int latestInterval = DateUtils.getCircaIntervalYears(latestDate.getYear(), latestDate.getEra()); - // Express the circa interval as a qualifier. 
- - // stack.push(earliestDate.withQualifier(QualifierType.MINUS, earliestInterval, QualifierUnit.YEARS)); - // stack.push(latestDate.withQualifier(QualifierType.PLUS, latestInterval, QualifierUnit.YEARS)); + // Express the circa interval as a qualifier. - // OR: + // stack.push(earliestDate.withQualifier(QualifierType.MINUS, earliestInterval, QualifierUnit.YEARS)); + // stack.push(latestDate.withQualifier(QualifierType.PLUS, latestInterval, QualifierUnit.YEARS)); + // OR: + // Express the circa interval as an offset calculated into the year. DateUtils.subtractYears(earliestDate, earliestInterval); @@ -930,7 +954,7 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp // Convert the string to a number, // and push on the stack. - Integer year = new Integer(ctx.NUMBER().getText()); + Integer year = Integer.valueOf(ctx.getText().replace(",", "")); // strip thousands separators, e.g. "5,580" if (year == 0) { throw new StructuredDateFormatException("unexpected year '" + ctx.NUMBER().getText() + "'"); @@ -1177,7 +1201,7 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp // Convert the numeric string to an Integer, // and push on the stack. - Integer num = new Integer(ctx.NUMBER().getText()); + Integer num = Integer.valueOf(ctx.getText().replace(",", "")); // strip thousands separators stack.push(num); } @@ -1191,6 +1215,29 @@ public class ANTLRStructuredDateEvaluator extends StructuredDateBaseListener imp stack.push(new Date()); } + public void exitUncalibratedDate(UncalibratedDateContext ctx) { + if (ctx.exception != null) return; + + Integer adjustmentDate = (Integer) stack.pop(); + Integer mainYear = (Integer) stack.pop(); + + Integer upperBound = mainYear + adjustmentDate; + Integer lowerBound = mainYear - adjustmentDate; + + Integer currentYear = DateUtils.getCurrentDate().getYear(); + + Integer earliestYear = currentYear - upperBound; + Integer latestYear = currentYear - lowerBound; + + // A negative year is BCE, otherwise CE. NOTE(review): a result of exactly 0 becomes year 0 CE, which the year rule rejects elsewhere - confirm. + Era earliestEra = earliestYear < 0 ? 
Era.BCE : Era.CE; + Era latestEra = latestYear < 0 ? Era.BCE : Era.CE; + + stack.push(new Date(Math.abs(earliestYear), 1, 1, earliestEra)); // Earliest Early Date + stack.push(new Date(Math.abs(latestYear), 12, DateUtils.getDaysInMonth(12, Math.abs(latestYear), latestEra), latestEra)); // Latest Late Date + + } + protected String getErrorMessage(RecognitionException re) { String message = ""; diff --git a/services/structureddate/structureddate/src/test/java/org/collectionspace/services/structureddate/StructuredDateEvaluatorTest.java b/services/structureddate/structureddate/src/test/java/org/collectionspace/services/structureddate/StructuredDateEvaluatorTest.java index 039252b8b..3f1befd7b 100644 --- a/services/structureddate/structureddate/src/test/java/org/collectionspace/services/structureddate/StructuredDateEvaluatorTest.java +++ b/services/structureddate/structureddate/src/test/java/org/collectionspace/services/structureddate/StructuredDateEvaluatorTest.java @@ -7,6 +7,7 @@ import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Stack; import org.apache.commons.beanutils.PropertyUtils; import org.slf4j.Logger; @@ -57,14 +58,19 @@ public class StructuredDateEvaluatorTest { if (structuredDateFields != null && structuredDateFields.containsKey("latestDate")) { Object latestDate = structuredDateFields.get("latestDate"); if (latestDate instanceof String) { + Date currentDate = DateUtils.getCurrentDate(); + ArrayList latestDateItems = new ArrayList<>(); if (latestDate.equals("current date")) { - ArrayList items = new ArrayList<>(); - Date currentDate = DateUtils.getCurrentDate(); - items.add(currentDate.getYear()); - items.add(currentDate.getMonth()); - items.add(currentDate.getDay()); - items.add(currentDate.getEra() == Era.BCE ? 
"BCE" : "CE"); - structuredDateFields.put("latestDate", items); + latestDateItems.add(currentDate.getYear()); + latestDateItems.add(currentDate.getMonth()); + latestDateItems.add(currentDate.getDay()); + latestDateItems.add(currentDate.getEra() == Era.BCE ? "BCE" : "CE"); + structuredDateFields.put("latestDate", latestDateItems); + } + if (latestDate.equals("uncalibrated latest date")) { + Stack results = calculateUncalibratedDate(displayDate, currentDate.getYear()); + structuredDateFields.put("latestDate", results.pop()); + structuredDateFields.put("earliestSingleDate", results.pop()); } } } @@ -103,6 +109,59 @@ public class StructuredDateEvaluatorTest { return structuredDate; } + + /** + * Calculates the uncalibrated date, since the yalm expected dates need to be dynamic + * as they will change from year to year. + * @param displayDate The current test's display date + * @param currentYear The current year + * + * @return a stack consisting of two ArrayLists, each containing the expected dates + */ + public Stack calculateUncalibratedDate(String displayDate, Integer currentYear) { + Stack stack = new Stack(); + ArrayList latestDate = new ArrayList<>(); + ArrayList earliestDate = new ArrayList<>(); + + + String reg = "±|\\+/-"; + String[] splitDateTokens = displayDate.split(reg); + String[] tokensPartTwo = splitDateTokens[1].split(" "); + + Integer mainYear = Integer.parseInt(splitDateTokens[0].replaceAll("\\s|,", "")); + Integer offset; + + try { + offset = Integer.parseInt(tokensPartTwo[0]); + } catch (Exception e) { + offset = Integer.parseInt(tokensPartTwo[1].replaceAll("\\s|,", "")); + } + + Integer earliestYear = currentYear - (mainYear + offset); + Integer latestYear = currentYear - (mainYear - offset); + + String earliestEra = earliestYear < 0 ? "BCE" : "CE"; + String latestEra = latestYear < 0 ? 
"BCE" : "CE"; + + earliestYear = Math.abs(earliestYear); + latestYear = Math.abs(latestYear); + + latestDate.add(latestYear); + latestDate.add(12); + latestDate.add(DateUtils.getDaysInMonth(12, latestYear, null)); + latestDate.add(latestEra); + + earliestDate.add(earliestYear); + earliestDate.add(1); + earliestDate.add(1); + earliestDate.add(earliestEra); + + stack.push(earliestDate); + stack.push(latestDate); + + return stack; + } + private Date createDateFromYamlSpec(List dateFields) { Date date = new Date(); Iterator fieldIterator = dateFields.iterator(); diff --git a/services/structureddate/structureddate/src/test/resources/test-dates.yaml b/services/structureddate/structureddate/src/test/resources/test-dates.yaml index 2b7e1925f..a746dbd75 100644 --- a/services/structureddate/structureddate/src/test/resources/test-dates.yaml +++ b/services/structureddate/structureddate/src/test/resources/test-dates.yaml @@ -925,8 +925,8 @@ latestDate: [2013, 4, 5, CE] '5/3/1962-4/5/2013 BC': # hyphenatedRange, date - earliestSingleDate: [1962, 5, 3, BCE] - latestDate: [2013, 4, 5, BCE] + earliestSingleDate: [2013, 4, 5, BCE] + latestDate: [1962, 5, 3, BCE] '5/3/1962 BC-4/5/2013': # hyphenatedRange, date earliestSingleDate: [1962, 5, 3, BCE] @@ -1069,8 +1069,8 @@ # latestDate: [ 10, 12, 31, BCE, null, PLUS, 106, YEARS] 'Circa 10 BC': # uncertainDate, year - calculating the uncertainty into the year field - earliestSingleDate: [ 115, 1, 1, BCE] - latestDate: [ 96, 12, 31, CE] + earliestSingleDate: [ 20, 1, 1, BCE] + latestDate: [ 1, 12, 31, CE] # 'Circa 10': # uncertainDate, year - using qualifier/value/unit fields # earliestSingleDate: [ 10, 1, 1, CE, null, MINUS, 105, YEARS] @@ -1148,11 +1148,11 @@ "13th april, 1995": # oneDisplayDate - singleInterval - dayFirstDate - Day (ordinal) Month Year earliestSingleDate: [1995, 4, 13, CE] - "13th april, 1995 - 5th may 1999": # oneDisplayDate - hyphenatedRange - dayFirstDate - Day (ordinal) Month Year + "13th april, 1995 - 5th may 
1999": # oneDisplayDate - hyphenatedRange - dayFirstDate - Day (ordinal) Month Year earliestSingleDate: [1995, 4, 13, CE] latestDate: [1999, 5, 5, CE] - "13 april 15": # oneDisplayDate - ambigous day and year - should be Year month day + "13 april 15": # oneDisplayDate - ambigous day and year - should be Year month day earliestSingleDate: [13, 4, 15, CE] "before 13 april 1995": # beforeAfterDate - Empty earliestSingleDate - Day Month Year Format @@ -1164,39 +1164,95 @@ latestDate: [2017, 6, 10, CE] - "after 13 april 1995": # beforeAfterDate - Empty latestDate calculated as current date - Day Month Year Format + "after 13 april 1995": # beforeAfterDate - Empty latestDate calculated as current date - Day Month Year Format earliestSingleDate: [1995, 4, 13, CE] latestDate: "current date" - "after april 13 1995": # beforeAfterDate - Empty latestDate calculated as current date - Month Day Year Format + "after april 13 1995": # beforeAfterDate - Empty latestDate calculated as current date - Month Day Year Format earliestSingleDate: [1995, 4, 13, CE] latestDate: "current date" - "10/2005-12/2006": # Month/Year - Month/Year date + "10/2005-12/2006": # Month/Year - Month/Year date earliestSingleDate: [2005, 10, 1, CE] latestDate: [2006, 12, 31, CE] - "04/1995-04/2018": # Month/Year - Month/Year date + "04/1995-04/2018": # Month/Year - Month/Year date earliestSingleDate: [1995, 4, 1, CE] latestDate: [2018, 4, 30, CE] - "unknown": # Unknown date: Should result in empty fields + "unknown": # Unknown date: Should result in empty fields earliestSingleDate: [] - "13 april 15": # oneDisplayDate - ambiguous day and year, intepreted as year month day + "13 april 15": # oneDisplayDate - ambiguous day and year, intepreted as year month day earliestSingleDate: [13, 4, 15, CE] - "04/5-6/2018": # Month/Day - Day/Year date + "04/5-6/2018": # Month/Day - Day/Year date earliestSingleDate: [2018, 4, 5, CE] latestDate: [2018, 4, 6, CE] - "04/03-07/09": # Ambigious NumDayInMonthRange - should 
be interpreted as Month/Day - Day/Year date earliestSingleDate: [9, 4, 3, CE] latestDate: [9, 4, 7, CE] '04/1996-07/09': # Semi-ambigious NumDayInMonthRange - should be interpreted as Month/Year - Month/Year date - earliestSingleDate: [1996, 4, 1, CE] - latestDate: [9, 7, 31, CE] + earliestSingleDate: [9, 7, 31, CE] + latestDate: [1996, 4, 1, CE] + + "1200±50 BP": # Uncalibrated date with ± symbol, with CE + earliestSingleDate: "uncalibrated earliest date" + latestDate: "uncalibrated latest date" + + "3100 +/- 150 BP": # Uncalibrated date with +/- instead of ± symbol + earliestSingleDate: "uncalibrated earliest date" + latestDate: "uncalibrated latest date" + + "3100+/-150 BP": # Uncalibrated date with +/- instead of ± symbol, no spaces + earliestSingleDate: "uncalibrated earliest date" + latestDate: "uncalibrated latest date" + + "3100+/-150 years BP": # Uncalibrated date with 'years' in it + earliestSingleDate: "uncalibrated earliest date" + latestDate: "uncalibrated latest date" + + "3,100+/-150 years BP": # Uncalibrated date with 'years' in it as well as with a comma + earliestSingleDate: "uncalibrated earliest date" + latestDate: "uncalibrated latest date" + + "2000±100 BP": # Uncalibrated date with BCE and CE mix + earliestSingleDate: "uncalibrated earliest date" + latestDate: "uncalibrated latest date" + + "5580-5460 BC": # Calibrated date range without commas + earliestSingleDate: [5580, 1, 1, BCE] + latestDate: [5460, 12, 31, BCE] + + "5,580 - 5,460 BC": # Calibrated date with commas and spaces + earliestSingleDate: [5580, 1, 1, BCE] + latestDate: [5460, 12, 31, BCE] + + "5460-5580 BC": # Calibrated date with dates reversed + earliestSingleDate: [5580, 1, 1, BCE] + latestDate: [5460, 12, 31, BCE] + + "c. 69 BC": # Circa date, ± 5 years + earliestSingleDate: [74, 1, 1, BCE] + latestDate: [64, 12, 31, BCE] + + "ca. 
60 BC": # Circa date, ± 10 years + earliestSingleDate: [70, 1, 1, BCE] + latestDate: [50, 12, 31, BCE] + + "circa 200 BC": # Circa date, ± 50 years + earliestSingleDate: [250, 1, 1, BCE] + latestDate: [150, 12, 31, BCE] + + "circa 1000 BC": # Circa date, ± 500 years + earliestSingleDate: [1500, 1, 1, BCE] + latestDate: [500, 12, 31, BCE] + + '5/13/54,962 BC-4/5/2,019': # hyphenatedRange, date with comma'd numbers + earliestSingleDate: [54962, 5, 13, BCE] + latestDate: [2019, 4, 5, CE] # ------------------------------------------------------------------------------------------------------- # Invalid dates # ------------------------------------------------------------------------------------------------------- -- 2.47.3