package edu.stanford.nlp.ie.pascal;
import edu.stanford.nlp.ie.QuantifiableEntityNormalizer;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import java.io.BufferedReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Represents dates and times according to the ISO 8601 standard while also allowing
* wildcards - e.g., it can represent "21 June" without a year.
* (Standard ISO 8601 only allows dropping the less significant fields, e.g.,
* 200706 rather than 20070621; it has no way to represent 0621 without a year.)
* <p/>
* Format stores date and time separately since the majority of current use
* cases involve only one of these items. Standard ISO 8601 instead
* requires <date>T<time>.
* <p/>
* Ranges are specified within the strings via forward slash. For example
* 6 June - 8 June is represented ****0606/****0608. 6 June onward is
* ****0606/ and until 8 June is /****0608.
*
* @author Anna Rafferty
* TODO: add time support - currently just dates are supported
*/
public class ISODateInstance {
// Compile-time switch guarding the diagnostic prints in this class.
private static final boolean DEBUG = false;
private ArrayList<String> tokens = new ArrayList<String>();//each token contains some piece of the date, from our input.
// Range markers accepted by the (String date, String openRangeMarker) constructor:
// OPEN_RANGE_AFTER makes it append '/' to the date and OPEN_RANGE_BEFORE makes it
// prepend '/'. That constructor has no branch for BOUNDED_RANGE, and NO_RANGE leaves
// the parsed date untouched.
public static final String OPEN_RANGE_AFTER = "A";
public static final String OPEN_RANGE_BEFORE = "B";
public static final String BOUNDED_RANGE = "C";
public static final String NO_RANGE = "";
// Suffixes appended by addExtraRanges() when a "first half"/"second half" modifier
// splits a month (day 15 / day 31) or a year (month 07 / month 12).
public static final int DAY_OF_HALF_MONTH = 15;
public static final int LAST_DAY_OF_MONTH = 31;//close enough for our purposes
public static final String MONTH_OF_HALF_YEAR = "07";
public static final String LAST_MONTH_OF_YEAR = "12";
/**
 * String of the format {@code <year><month><day>}. Representations
 * by week are also allowed. If a more general field (such as year)
 * is not specified when a less general one (such as month) is, the characters
 * normally filled by the more general field are replaced by asterisks. For example,
 * 21 June would be "****0621". Less general fields are simply truncated;
 * for example, June 2007 would be "200706".
 */
private String isoDate = "";
//Variable for marking if we were unable to parse the string associated with this isoDate
private boolean unparseable = false;
//private String isoTime = "";
/**
 * Builds an instance with an empty date string that is not flagged as
 * unparseable. Mostly useful as a starting point for
 * {@link #fromDateString(String)}; ordinary callers want one of the parsing
 * constructors instead.
 */
public ISODateInstance() {
  super();
}
/**
 * Parses a free-text date and keeps the normalized, ISO 8601-style result
 * (with wildcards where fields are missing) in this instance.
 *
 * @param date the text to normalize
 */
public ISODateInstance(String date) {
  this.extractFields(date);
}
public ISODateInstance(String date, String openRangeMarker) {
extractFields(date);
//now process the range marker; if a range was found independently, we ignore the marker
if ( ! ISODateInstance.NO_RANGE.equals(openRangeMarker) && ! isoDate.contains("/")) {
if (ISODateInstance.OPEN_RANGE_AFTER.equals(openRangeMarker)) {
isoDate = isoDate + '/';
} else if (ISODateInstance.OPEN_RANGE_BEFORE.equals(openRangeMarker)) {
isoDate = '/' + isoDate;
}
}
}
/**
* Constructor for a range of dates, beginning at date start and finishing at date end
*
*/
public ISODateInstance(ISODateInstance start, ISODateInstance end) {
String startString = start.getDateString();
if (start.isRange()) {
startString = start.getStartDate();
}
String endString = end.getDateString();
if (end.isRange()) {
endString = end.getEndDate();
}
isoDate = startString + '/' + endString;
unparseable = (start.isUnparseable() || end.isUnparseable());
}
/**
 * Builds a date from its relation to {@code referenceDate}. The relation is
 * looked up (lower-cased) in {@link #relativeDateMap}, e.g. "today",
 * "tomorrow" or "yesterday", and the matching field of the reference date is
 * shifted by the mapped amount. When the phrase is not in the map the date
 * string stays empty.
 *
 * @param referenceDate the date the phrase is relative to
 * @param relativeDate  the relative phrase to resolve
 */
public ISODateInstance(ISODateInstance referenceDate, String relativeDate) {
  Pair<DateField, Integer> relation = relativeDateMap.get(relativeDate.toLowerCase());
  if (relation == null) {
    return;
  }
  switch (relation.first()) {
    case DAY:
      incrementDay(referenceDate, relation);
      break;
    case MONTH:
      incrementMonth(referenceDate, relation);
      break;
    case YEAR:
      incrementYear(referenceDate, relation);
      break;
    default:
      break;
  }
}
/**
 * Sets this date to the start date of {@code referenceDate} with its year
 * shifted by {@code relation.second()}. A wildcard year cannot be shifted, so
 * the reference string is copied unchanged in that case.
 */
private void incrementYear(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
  String base = referenceDate.getStartDate();
  String yearPart = base.substring(0, 4);
  if (yearPart.indexOf('*') >= 0) {
    isoDate = base;
  } else {
    int shiftedYear = Integer.parseInt(yearPart) + relation.second();
    isoDate = makeStringYearChange(base, shiftedYear);
  }
}
/**
 * Sets this date to the start date of {@code referenceDate} with its month
 * shifted by {@code relation.second()} months, carrying into the year when the
 * shift crosses a year boundary.
 * <p/>
 * If the reference has no month field (shorter than six characters) or its
 * month is a wildcard, the reference string is copied unchanged. If the year is
 * a wildcard the month still wraps but the year is left alone.
 */
private void incrementMonth(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
  String origDateString = referenceDate.getStartDate();
  if (origDateString.length() < 6) {
    // No month field to shift.
    isoDate = origDateString;
    return;
  }
  String monthString = origDateString.substring(4, 6);
  if (monthString.contains("*")) {
    isoDate = origDateString;
    return;
  }
  // Work with a zero-based month so that wrap-around is plain modular arithmetic.
  int zeroBasedMonth = Integer.parseInt(monthString) - 1 + relation.second();
  int yearShift = zeroBasedMonth / 12;
  int newMonthIndex = zeroBasedMonth % 12;
  if (newMonthIndex < 0) {
    // Java's % keeps the sign of the dividend; normalize to 0..11 and borrow a year.
    newMonthIndex += 12;
    yearShift--;
  }
  isoDate = makeStringMonthChange(origDateString, newMonthIndex + 1);
  if (yearShift != 0) {
    String yearString = origDateString.substring(0, 4);
    if (!yearString.contains("*")) {
      isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) + yearShift);
    }
  }
}
/**
 * Sets this date to the start date of {@code referenceDate} shifted by
 * {@code relation.second()} days.
 * <p/>
 * The reference string is copied unchanged when it has no day field (shorter
 * than eight characters), when the day is a wildcard, or when the shift would
 * leave the month but the month is a wildcard. A shift that leaves the month
 * rolls into the adjacent month (and year, if the year is known). Only one
 * month boundary is crossed, so shifts longer than a month are not supported.
 * Month lengths come from {@link #daysPerMonth}; an unknown month is assumed to
 * have 30 days.
 */
private void incrementDay(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
  String origDateString = referenceDate.getStartDate();
  if (origDateString.length() < 8) {
    // Year-only or year-month strings carry no day; their last two characters
    // must not be mistaken for one.
    isoDate = origDateString;
    return;
  }
  int length = origDateString.length();
  String dayString = origDateString.substring(length - 2);
  if (dayString.contains("*")) {
    isoDate = origDateString;
    return;
  }
  //Date is not a variable
  int dayNum = Integer.parseInt(dayString);
  int delta = relation.second();
  String monthString = origDateString.substring(length - 4, length - 2);
  int numDaysInMonth = 30;//default - assume this if month is a variable
  int monthNum = -1;//ie, we don't know the month yet - this remains -1 if the month is a variable
  if (!monthString.contains("*")) {
    monthNum = Integer.parseInt(monthString);
    numDaysInMonth = daysPerMonth.get(monthNum);
  }
  int shiftedDay = dayNum + delta;
  if (shiftedDay >= 1 && shiftedDay <= numDaysInMonth) {
    //Not an edge case - the shifted day stays inside the current month
    isoDate = makeStringDayChange(origDateString, shiftedDay);
    return;
  }
  //We have to leave the month, which is impossible if the month is a variable
  if (monthNum == -1) {
    isoDate = origDateString;
    return;
  }
  //At this point, neither our day nor our month is a variable
  isoDate = origDateString;
  boolean decreasing = shiftedDay < 1;
  int newMonthNum;
  if ((monthNum + 1 > 12 && !decreasing) || (monthNum - 1 < 1 && decreasing)) {
    //Crossing a year boundary: wrap the month and, if we can, shift the year
    newMonthNum = decreasing ? 12 : 1;
    //TODO: fix this to work more nicely with variables and thus handle more cases
    String yearString = origDateString.substring(0, 4);
    if (!yearString.contains("*")) {
      int yearNum = Integer.parseInt(yearString);
      isoDate = makeStringYearChange(isoDate, decreasing ? yearNum - 1 : yearNum + 1);
    }
  } else {
    newMonthNum = decreasing ? monthNum - 1 : monthNum + 1;
  }
  isoDate = makeStringMonthChange(isoDate, newMonthNum);
  int newDateNum;
  if (decreasing) {
    // shiftedDay is <= 0 here: count back from the end of the previous month.
    newDateNum = daysPerMonth.get(newMonthNum) + shiftedDay;
  } else {
    // Carry the overflow past the end of the current month into the next one.
    newDateNum = shiftedDay - daysPerMonth.get(monthNum);
  }
  isoDate = makeStringDayChange(isoDate, newDateNum);
}
/**
 * Returns {@code origDate} with its final two characters (the day) replaced by
 * {@code newDay}, zero-padded to two characters (e.g., 9 -> "09").
 */
private static String makeStringDayChange(String origDate, int newDay) {
  StringBuilder changed = new StringBuilder(origDate.substring(0, origDate.length() - 2));
  if (newDay < 10) {
    changed.append('0');
  }
  changed.append(newDay);
  return changed.toString();
}
/**
 * Returns {@code origDate} with its month (characters 4-5) replaced by
 * {@code newMonth}, zero-padded to two characters (e.g., 9 -> "09").
 * The year (characters 0-3) is kept, as is the day (characters 6-7) when
 * present; month-precision input such as "200706" yields a month-precision
 * result instead of failing.
 */
private static String makeStringMonthChange(String origDate, int newMonth) {
  String newMonthString = (newMonth < 10 ? ("0" + newMonth) : String.valueOf(newMonth));
  // Only copy a day field if the input actually has one.
  int dayEnd = Math.min(8, origDate.length());
  String dayPart = dayEnd > 6 ? origDate.substring(6, dayEnd) : "";
  return origDate.substring(0, 4) + newMonthString + dayPart;
}
/**
 * Returns {@code origDate} with its year (the first four characters) replaced
 * by {@code newYear}, left-padded with zeros to at least four characters.
 */
private static String makeStringYearChange(String origDate, int newYear) {
  StringBuilder year = new StringBuilder(String.valueOf(newYear));
  while (year.length() < 4) {
    year.insert(0, '0');//we're compatible with year 1!
  }
  return year.append(origDate.substring(4)).toString();
}
/**
 * The date fields that an entry of the relative-date map can name as the one
 * to modify.
 */
public enum DateField {
  DAY,
  MONTH,
  YEAR
}
/**
 * Maps a lower-case relative-date phrase to a pair of (field to modify, amount
 * to add to that field). The relative-date constructor looks phrases up here
 * after lower-casing them. Only day-level phrases are registered by default.
 */
public static final Map<String, Pair<DateField, Integer>> relativeDateMap = Generics.newHashMap();
static {
//Add entries to the relative datemap
relativeDateMap.put("today", new Pair<DateField, Integer>(DateField.DAY, 0));
relativeDateMap.put("tomorrow", new Pair<DateField, Integer>(DateField.DAY, 1));
relativeDateMap.put("yesterday", new Pair<DateField, Integer>(DateField.DAY, -1));
}
/**
 * Number of days in each month, keyed by month number 1-12. February is
 * always 28; leap years are not modelled.
 */
public static final Map<Integer, Integer> daysPerMonth = Generics.newHashMap();
static {
  final int[] monthLengths = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
  for (int month = 1; month <= monthLengths.length; month++) {
    daysPerMonth.put(month, monthLengths[month - 1]);
  }
}
/**
 * Wraps a string that is already in ISODateInstance format (for example one
 * produced earlier by {@link #toString()}) without re-parsing it.
 *
 * @param date the already-normalized date string
 * @return a new instance whose date string is exactly {@code date}
 */
public static ISODateInstance fromDateString(String date) {
  ISODateInstance wrapped = new ISODateInstance();
  wrapped.isoDate = date;
  return wrapped;
}
/**
 * Returns the normalized date string held by this instance (or the stored raw
 * text when the input could not be parsed).
 */
@Override
public String toString() {
  return isoDate;
}
/**
 * Backwards-compatibility alias (from DateInstance) for {@link #toString()}.
 *
 * @return the same string as {@code toString()}
 */
public String getDateString() {
  return toString();
}
/**
 * Parses {@code inputDate} and stores the normalized result in {@code isoDate}.
 * Strategies are tried in this order:
 * <ol>
 * <li>a two-sided range split on a hyphen, each side parsed by a fresh instance;</li>
 * <li>a complete YYYY-MM-DD style date;</li>
 * <li>a complete MM-DD-YY style date;</li>
 * <li>year, month and day extracted independently via regexp matching, followed
 *     by "first half"/"second half" range modifiers;</li>
 * <li>as a last resort a weekday name; the instance is then marked unparseable.</li>
 * </ol>
 * TODO: Find a way to mark what's already been handled in the string
 *
 * @param inputDate the text to parse
 * @return true if one of strategies 1-4 recognised something; false otherwise
 *         (even when a weekday name was found)
 */
public boolean extractFields(String inputDate) {
// Tokenize only when fewer than two tokens are currently held.
if (tokens.size() < 2) {
tokenizeDate(inputDate);
}
if (DEBUG) {
System.err.println("Extracting date: " + inputDate);
}
//first we see if it's a hyphen and two parseable dates - if not, we treat it as one date
Pair<String, String> dateEndpoints = getRangeDates(inputDate);
if (dateEndpoints != null) {
ISODateInstance date1 = new ISODateInstance(dateEndpoints.first());
if (dateEndpoints.first().contains(" ") && !dateEndpoints.second().contains(" ")) {
//consider whether it's a leading modifier; e.g., "June 8-10" will be split into June 8, and 10 when really we'd like June 8 and June 10
String date = dateEndpoints.first().substring(0, dateEndpoints.first().indexOf(' ')) + ' ' + dateEndpoints.second();
ISODateInstance date2 = new ISODateInstance(date);
if (!date1.isUnparseable() && !date2.isUnparseable()) {
isoDate = (new ISODateInstance(date1, date2)).getDateString();
return true;
}
}
// Otherwise (or if the borrowed-modifier attempt failed) parse the right side on its own.
ISODateInstance date2 = new ISODateInstance(dateEndpoints.second());
if (!date1.isUnparseable() && !date2.isUnparseable()) {
isoDate = (new ISODateInstance(date1, date2)).getDateString();
return true;
}
}
if (extractYYYYMMDD(inputDate)) {
return true;
}
if (extractMMDDYY(inputDate)) {
return true;
}
// Field-by-field extraction. Every extractor is always run (the call is on the
// left of ||) because each one appends its field to isoDate.
boolean passed = false;
passed = extractYear(inputDate) || passed;
passed = extractMonth(inputDate) || passed;
passed = extractDay(inputDate) || passed;
//slightly hacky, but check for some common modifiers that get grouped into the date
passed = addExtraRanges(inputDate) || passed;
if (!passed) {//couldn't parse
//try one more trick
unparseable = true;
boolean weekday = extractWeekday(inputDate);
if (!weekday) {
// Nothing recognised at all: keep the raw input as the date string.
isoDate = inputDate;
}
}
return passed;
}
private static String[] rangeIndicators = {"--", "-"};

/**
 * Tries to split {@code inputDate} into the two sides of a range. Each entry of
 * {@code rangeIndicators} is tried in order as a split pattern; the first one
 * that yields exactly two pieces wins.
 *
 * @return the (left, right) pieces, or null if no indicator gives two pieces
 */
private static Pair<String, String> getRangeDates(String inputDate) {
  for (int i = 0; i < rangeIndicators.length; i++) {
    String[] pieces = inputDate.split(rangeIndicators[i]);
    if (pieces.length != 2) {
      continue;
    }
    return new Pair<String, String>(pieces[0], pieces[1]);
  }
  return null;
}
/**
 * Expands a year or month into a half-period range when the input mentions
 * "first ... half" or "second ... half". Only applies when this date is not
 * already a range and the current date string is at most six characters long;
 * a four-character string is treated as a year, anything else as a month.
 *
 * @return true if the date string was rewritten as a half-period range
 */
private boolean addExtraRanges(String inputDate) {
  if (isRange()) {
    return false;
  }
  String lowered = inputDate.toLowerCase();
  if (!lowered.contains("half") || isoDate.length() > 6) {
    return false;
  }
  boolean yearOnly = (isoDate.length() == 4);
  String rangeStart;
  String rangeEnd;
  if (lowered.contains("first")) {
    rangeStart = isoDate + "01";
    if (yearOnly) {
      rangeEnd = isoDate + MONTH_OF_HALF_YEAR;
    } else {
      rangeEnd = isoDate + DAY_OF_HALF_MONTH;
    }
  } else if (lowered.contains("second")) {
    if (yearOnly) {
      rangeStart = isoDate + MONTH_OF_HALF_YEAR;
      rangeEnd = isoDate + LAST_MONTH_OF_YEAR;
    } else {
      rangeStart = isoDate + DAY_OF_HALF_MONTH;
      rangeEnd = isoDate + LAST_DAY_OF_MONTH;
    }
  } else {
    return false;
  }
  isoDate = rangeStart + '/' + rangeEnd;
  return true;
}
/**
 * Returns true iff this date represents a range, i.e. it was parsed
 * successfully and its date string contains the range separator '/'.
 * The range must have at least a start or end date, but is not guaranteed
 * to have both.
 *
 * @return Whether this date represents a range
 */
public boolean isRange() {
  if (unparseable) {
    return false;
  }
  // String.matches("/") would require the whole string to be exactly "/";
  // a range is any date string that contains the separator.
  return isoDate.contains("/");
}
/**
 * Tells whether the input text for this date could not be parsed. For such
 * instances the comparison methods fall back to plain equality of the stored
 * strings.
 *
 * @return true if parsing failed
 */
public boolean isUnparseable() {
  return this.unparseable;
}
/**
 * Returns this date, or the date the range starts on if it is a range.
 * For a range of the form {@code /<date>} the empty string is returned.
 *
 * @return Start date of range
 */
public String getStartDate() {
  if (isRange()) {
    int slash = isoDate.indexOf('/');
    if (slash == 0) {
      return "";
    }
    if (slash > 0) {
      return isoDate.substring(0, slash);
    }
  }
  return isoDate;
}
/**
 * Returns this date, or the date the range ends on if it is a range.
 * For a range of the form {@code <date>/} the empty string is returned.
 *
 * @return End date of range
 */
public String getEndDate() {
  if (!isRange()) {
    return isoDate;
  }
  int lastSlash = isoDate.lastIndexOf('/');
  if (lastSlash == isoDate.length() - 1) {
    // The separator is the final character: open-ended range.
    return "";
  }
  return isoDate.substring(lastSlash + 1);
}
/* -------------------------- Static Comparison Methods -------------------------- */
/**
 * Returns true if date1 is after date2. Both arguments must be plain date
 * strings (digits and wildcards) with at least a year field; otherwise false
 * is returned.
 * <p/>
 * Several tricky cases exist, and implementation tries to
 * go with the common sense interpretation:
 * When a year and a month are given for one, but only a month
 * for the other, it is assumed that both have the same year
 * e.g:
 * ****12 is after 200211
 * <p/>
 * When a year and a month are given for one but only a year
 * for the other, it is assumed that one of these is after the
 * other only if the years differ, e.g.:
 * 2003 is after 200211
 * 2002 is not after 200211
 * 200211 is not after 2002
 *
 * @return Whether date1 is after date2
 */
static boolean isAfter(String date1, String date2) {
  if (!isDateFormat(date1) || !isDateFormat(date2)) {
    return false;
  }
  if (date1.length() < 4 || date2.length() < 4) {
    // No complete year field to compare; avoids an out-of-bounds substring below.
    return false;
  }
  boolean after = true;
  //first check years
  String year = date1.substring(0, 4);
  String yearOther = date2.substring(0, 4);
  boolean yearWildcard = year.contains("*") || yearOther.contains("*");
  if (yearWildcard) {
    after = checkWildcardCompatibility(year, yearOther);
  } else {
    int yearNum = Integer.parseInt(year);
    int yearOtherNum = Integer.parseInt(yearOther);
    if (yearNum > yearOtherNum) {
      return true;
    }
    if (yearNum < yearOtherNum) {
      return false;
    }
  }
  if (date1.length() < 6 || date2.length() < 6) {
    // Concrete years are equal at this point, so only a wildcard year can say "after".
    return yearWildcard && after;
  }
  //then check months
  String month = date1.substring(4, 6);
  String monthOther = date2.substring(4, 6);
  boolean monthWildcard = month.contains("*") || monthOther.contains("*");
  if (monthWildcard) {
    after = after && checkWildcardCompatibility(month, monthOther);
  } else {
    int monthNum = Integer.parseInt(month);
    int monthOtherNum = Integer.parseInt(monthOther);
    if (monthNum > monthOtherNum) {
      return true;
    }
    if (monthNum < monthOtherNum) {
      return false;
    }
  }
  if (date1.length() < 8 || date2.length() < 8) {
    // Concrete months are equal here; same reasoning as for the years.
    return monthWildcard && after;
  }
  //then check days
  String day = date1.substring(6, 8);
  String dayOther = date2.substring(6, 8);
  if (day.contains("*") || dayOther.contains("*")) {
    return after && checkWildcardCompatibility(day, dayOther);
  }
  // Strict comparison: equal days are not "after".
  return Integer.parseInt(day) > Integer.parseInt(dayOther);
}
/**
 * Position-by-position comparison of two equal-length strings in which a '*'
 * on either side matches any character. Strings of different length are never
 * compatible.
 */
@SuppressWarnings("unused")
private static boolean checkWildcardAfterCompatibility(String txt1, String txt2) {
  int length = txt1.length();
  if (length != txt2.length()) {
    return false;
  }
  for (int pos = 0; pos < length; pos++) {
    char left = txt1.charAt(pos);
    char right = txt2.charAt(pos);
    boolean agrees = left == '*' || right == '*' || left == right;
    if (!agrees) {
      return false;
    }
  }
  return true;
}
/**
 * Returns true if the given txt consists only of the digits 0-9 and "*"
 * characters and contains at least one digit; false otherwise. Strings that are
 * empty or made up solely of wildcards are therefore rejected, as are strings
 * with a sign or a range separator.
 */
private static boolean isDateFormat(String txt) {
  boolean sawDigit = false;
  for (int i = 0; i < txt.length(); i++) {
    char c = txt.charAt(i);
    if (c >= '0' && c <= '9') {
      sawDigit = true;
    } else if (c != '*') {
      return false;
    }
  }
  return sawDigit;
}
/**
 * Returns true iff date1 could represent the same value as date2, i.e. their
 * years, months and days are each compatible. E.g.
 * ****07 is compatible with 200207 (and 200207 is compatible with ****07)
 * 200207 is compatible with 20020714 (?maybe need a better idea of use case here...)
 */
public static boolean isCompatible(String date1, String date2) {
  return isYearCompatible(date1, date2)
      && isMonthCompatible(date1, date2)
      && isDayCompatible(date1, date2);
}
/**
 * Checks whether the year fields (characters 0-3) of the two dates are
 * compatible. A date too short to hold a year is compatible with anything;
 * wildcards match any character; otherwise the years must be equal.
 */
private static boolean isYearCompatible(String date1, String date2) {
  if (date1.length() < 4 || date2.length() < 4) {
    return true;
  }
  String left = date1.substring(0, 4);
  String right = date2.substring(0, 4);
  boolean hasWildcard = left.indexOf('*') >= 0 || right.indexOf('*') >= 0;
  return hasWildcard ? checkWildcardCompatibility(left, right) : left.equals(right);
}
/**
 * Checks whether the month fields (characters 4-5) of the two dates are
 * compatible. A date too short to hold a month is compatible with anything;
 * wildcards match any character; otherwise the months must be equal.
 */
private static boolean isMonthCompatible(String date1, String date2) {
  if (date1.length() < 6 || date2.length() < 6) {
    return true;
  }
  String left = date1.substring(4, 6);
  String right = date2.substring(4, 6);
  boolean hasWildcard = left.indexOf('*') >= 0 || right.indexOf('*') >= 0;
  return hasWildcard ? checkWildcardCompatibility(left, right) : left.equals(right);
}
/**
 * Checks whether the day fields (characters 6-7) of the two dates are
 * compatible. A date too short to hold a day is compatible with anything;
 * wildcards match any character; otherwise the days must be equal.
 */
private static boolean isDayCompatible(String date1, String date2) {
  if (date1.length() < 8 || date2.length() < 8) {
    return true;
  }
  String left = date1.substring(6, 8);
  String right = date2.substring(6, 8);
  boolean hasWildcard = left.indexOf('*') >= 0 || right.indexOf('*') >= 0;
  return hasWildcard ? checkWildcardCompatibility(left, right) : left.equals(right);
}
/**
 * Returns true iff the two strings have the same length and, at every
 * position, the characters are equal or at least one of them is the
 * wildcard '*'.
 */
private static boolean checkWildcardCompatibility(String txt1, String txt2) {
  if (txt1.length() != txt2.length()) {
    return false;
  }
  int index = 0;
  while (index < txt1.length()) {
    char c1 = txt1.charAt(index);
    char c2 = txt2.charAt(index);
    if (c1 != '*' && c2 != '*' && c1 != c2) {
      return false;
    }
    index++;
  }
  return true;
}
/* -------------------------- Instance Comparison Methods -------------------------- */
/**
 * Returns true iff this date contains the date represented by other.
 * A range contains a date if it is equal to or after the start date and
 * equal to or before the end date. For open ranges, contains is also
 * inclusive of the one end point. If either instance is unparseable, the
 * stored strings are simply compared for equality.
 *
 * @param other the date (or range) to test
 * @return whether other lies within this date
 */
public boolean contains(ISODateInstance other) {
  if (this.isUnparseable() || other.isUnparseable()) {
    return this.isoDate.equals(other.isoDate);
  }
  String start = this.getStartDate();
  if (!start.equals("")) {//we have a start date, need to make sure other is not before it
    String startOther = other.getStartDate();
    if (startOther.equals("")) {
      return false;//incompatible
    }
    // isAfter is strict, so an identical start has to be accepted explicitly.
    if (!startOther.equals(start) && !isAfter(startOther, start)) {
      return false;
    }
  }
  //now we've found out that the start date is appropriate, check the end date
  String end = this.getEndDate();
  if (!end.equals("")) {
    String endOther = other.getEndDate();
    if (endOther.equals("")) {
      return false;
    }
    if (!endOther.equals(end) && !isAfter(end, endOther)) {
      return false;
    }
  }
  return true;//passes both start and end
}
/**
 * Returns true if this date instance is after the given dateString. If this
 * date instance is a range, then returns true only if both start and end
 * dates are after dateString (so a range with an open side is never "after").
 * Unparseable instances and malformed date strings yield false.
 * <p/>
 * Several tricky cases exist, and implementation tries to
 * go with the commonsense interpretation:
 * When a year and a month are given for one, but only a month
 * for the other, it is assumed that both have the same year
 * e.g:
 * ****12 is after 200211
 * <p/>
 * When a year and a month are given for one but only a year
 * for the other, it is assumed that one of these is after the
 * other only if the years differ, e.g.:
 * 2003 is after 200211
 * 2002 is not after 200211
 * 200211 is not after 2002
 *
 * @param dateString the date to compare against
 * @return whether this date lies entirely after dateString
 */
public boolean isAfter(String dateString) {
  if (this.isUnparseable()) {
    return false;
  }
  if (!isDateFormat(dateString)) {
    return false;
  }
  // For a non-range both accessors return the same string, so this is one check twice.
  return isAfter(this.getStartDate(), dateString) && isAfter(this.getEndDate(), dateString);
}
/**
 * Tells whether other could denote the same date as this one. Unparseable
 * instances are compared by their stored strings. If this date is a range, it
 * must contain other; a non-range is never compatible with a range; two
 * non-ranges are compared field by field.
 */
public boolean isCompatibleDate(ISODateInstance other) {
  if (this.isUnparseable() || other.isUnparseable()) {
    return this.isoDate.equals(other.isoDate);
  }
  if (this.isRange()) {
    return this.contains(other);
  }
  // not compatible if other is range and this isn't
  return !other.isRange() && isCompatible(isoDate, other.getDateString());
}
/**
 * Looks if the years for the two dates are compatible.
 * This method does not consider ranges and uses only the
 * start date of each instance. Unparseable instances are compared by
 * their stored strings.
 */
public boolean isYearCompatible(ISODateInstance other) {
  if (this.isUnparseable() || other.isUnparseable()) {
    return this.isoDate.equals(other.isoDate);
  }
  return isYearCompatible(this.getStartDate(), other.getStartDate());
}
/**
 * Looks if the months for the two dates are compatible.
 * This method does not consider ranges and uses only the
 * start date of each instance. Unparseable instances are compared by
 * their stored strings.
 */
public boolean isMonthCompatible(ISODateInstance other) {
  if (this.isUnparseable() || other.isUnparseable()) {
    return this.isoDate.equals(other.isoDate);
  }
  return isMonthCompatible(this.getStartDate(), other.getStartDate());
}
/**
 * Looks if the days for the two dates are compatible.
 * This method does not consider ranges and uses only the
 * start date of each instance. Unparseable instances are compared by
 * their stored strings.
 */
public boolean isDayCompatible(ISODateInstance other) {
  if (this.isUnparseable() || other.isUnparseable()) {
    return this.isoDate.equals(other.isoDate);
  }
  return isDayCompatible(this.getStartDate(), other.getStartDate());
}
/* -------------------------- Tokenization and Field Extraction -------------------------- */
//These methods are taken directly from or modified slightly from {@link DateInstance}
/**
 * Rebuilds {@code tokens} from {@code inputDate}: hyphens are padded with
 * spaces, commas become spaces, and the result is run through the PTB
 * tokenizer. A null input is reported on standard output before the matcher
 * is created.
 */
private void tokenizeDate(String inputDate) {
  tokens = new ArrayList<String>();
  Pattern hyphen = Pattern.compile("[-]");
  if (inputDate == null) {
    System.out.println("Null input date");
  }
  String spaced = hyphen.matcher(inputDate).replaceAll(" - ").replaceAll(",", " ");
  for (PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(spaced)));
       tokenizer.hasNext(); ) {
    tokens.add(tokenizer.next().toString());
  }
  if (DEBUG) {
    System.out.println("tokens:" + tokens);
  }
}
/**
 * Handles YYYY-MM-DD style dates (separators may be space, slash or hyphen;
 * the one between year and month is optional). The whole input must match.
 * On success the date string becomes year + two-digit month + two-digit day.
 *
 * @return whether it worked.
 */
private boolean extractYYYYMMDD(String inputDate) {
  Matcher m = Pattern.compile("([12][0-9]{3})[ /-]?([01]?[0-9])[ /-]([0-3]?[0-9])[ \t\r\n\f]*").matcher(inputDate);
  if (!m.matches()) {
    return false;
  }
  if (DEBUG) {
    System.err.println("YYYYMMDD succeeded");
  }
  String month = m.group(2);
  if (month.length() < 2) {//we always use two digit months
    month = '0' + month;
  }
  String day = m.group(3);
  if (day.length() < 2) {
    day = '0' + day;
  }
  isoDate = m.group(1) + month + day;
  return true;
}
/**
 * Handles MM/DD/YY(YY) style dates separated by slashes or hyphens; the whole
 * input must match. Two-digit years below 50 are placed in the 2000s, all
 * other two-digit years in the 1900s.
 * Note: This method copied from {@code DateInstance}; not sure how we tell that it
 * is MMDD versus DDMM (sometimes it will be ambiguous).
 *
 * @return whether it worked.
 */
private boolean extractMMDDYY(String inputDate) {
  Matcher m = Pattern.compile("([0-1]??[0-9])[ \t\n\r\f]*[/-][ \t\n\r\f]*([0-3]??[0-9])[ \t\r\n\f]*[/-][ \t\r\n\f]*([0-2]??[0-9]??[0-9][0-9])[ \t\r\n\f]*").matcher(inputDate);
  if (!m.matches()) {
    return false;
  }
  if (DEBUG) {
    System.err.println("MMDDYY succeeded");
  }
  String month = m.group(1);
  if (month.length() < 2) {//we always use two digit months
    month = '0' + month;
  }
  String day = m.group(2);
  if (day.length() < 2) {
    day = '0' + day;
  }
  String rawYear = m.group(3);
  String year = rawYear;
  if (rawYear.length() == 2) {
    //Prefix "20" or "19" to the two digit year depending on its value
    String century = (Integer.parseInt(rawYear) < 50) ? "20" : "19";
    year = century + rawYear;
  }
  isoDate = year + month + day;
  return true;
}
// re1: a four-digit number starting with 1 or 2, or an apostrophe followed by two digits ('07).
private Pattern re1 = Pattern.compile("[1-2][0-9]{3}|'[0-9]{2}");
// re2: a digit, then a non-digit, then anything, ending in two digits (optionally followed
// by whitespace); group 1 captures those two trailing digits.
private Pattern re2 = Pattern.compile("[0-9][^0-9].*([0-9]{2})\\s*$");
/**
 * Looks for a year in {@code inputDate} and, if one is found, REPLACES the
 * current date string with it. Sources are tried in order: {@code re1},
 * {@code re2}, then {@code foundMiscYearPattern}. Two-character years are
 * expanded ("20" prefix when the first digit is below 5, else "19"). If the
 * input ends in 's', the year is turned into a century or decade range.
 * When no year is found the date string is set to the wildcard "****".
 *
 * @param inputDate the text to search
 * @return true if a year was extracted, false otherwise
 */
public boolean extractYear(String inputDate) {
if (DEBUG) {
System.err.println("Extracting year from: |" + inputDate + '|');
}
String extract;
Matcher m1 = re1.matcher(inputDate);
Matcher m2 = re2.matcher(inputDate);
if (m1.find()) {
extract = m1.group(0);
} else if (m2.find()) {
extract = m2.group(1);
} else {
extract = foundMiscYearPattern(inputDate);
if (extract == null || extract.equals("")) {
isoDate = "****";
return false;
}
}
if ( ! "".equals(extract)) {
// Drop the apostrophe of an abbreviated year such as '07.
if (extract.charAt(0) == '\'') {
extract = extract.substring(1);
}
extract = extract.trim();
if (extract.length() == 2) {
if (extract.charAt(0) < '5') {
extract = "20" + extract;
} else {
extract = "19" + extract;
}
}
// NOTE(review): this tests the last character of the whole input, not of the matched
// year, so any input ending in 's' takes this branch - confirm that is intended.
if (inputDate.charAt(inputDate.length() - 1) == 's') {//decade or century marker
if (extract.charAt(2) == '0') {//e.g., 1900s -> 1900/1999
String endDate = Integer.toString((Integer.parseInt(extract) + 99));
extract = extract + '/' + endDate;
} else {//e.g., 1920s -> 1920/1929
String endDate = Integer.toString((Integer.parseInt(extract) + 9));
extract = extract + '/' + endDate;
}
}
isoDate = extract;
if (DEBUG) {
System.err.println("year extracted:" + extract);
}
return true;
}
// Fallback for an empty extract; every path above that reaches the preceding "if"
// appears to carry a non-empty extract, so this looks unreachable in practice.
isoDate = "****";
return false;
}
/**
 * Tries to find a year pattern in the input string that may be somewhat
 * odd/non-standard. Currently only inputs containing the word "century"
 * (case-insensitively) are handled: a trailing "A.D. " and a leading
 * "late"/"early" are stripped, the remaining text is converted to a number via
 * QuantifiableEntityNormalizer, and the result is right-padded with '*' to
 * four characters.
 *
 * @param inputDate the text to inspect
 * @return the wildcard-padded year string, or "" if nothing was recognised
 */
private static String foundMiscYearPattern(String inputDate) {
String year = "";
if (inputDate.toLowerCase().contains("century")) {
if (inputDate.endsWith("A.D. ")) {
inputDate = inputDate.substring(0, inputDate.length()-5);
if(DEBUG) {
System.out.println("inputDate: |" + inputDate + "|");
}
}
// "late" plus the following character (presumably a space) is dropped.
if (inputDate.startsWith("late")) {
inputDate = inputDate.substring(5, inputDate.length());
if(DEBUG) {
System.out.println("inputDate: |" + inputDate + "|");
}
}
// Likewise "early" plus the following character.
if (inputDate.startsWith("early")) {
inputDate = inputDate.substring(6, inputDate.length());
if(DEBUG) {
System.out.println("inputDate: |" + inputDate + "|");
}
}
if (Character.isDigit(inputDate.charAt(0))) {
// just parse number part, assuming last two letters are st/nd/rd
// NOTE(review): the last two characters of the whole remaining input are dropped here,
// which is not the ordinal suffix if the word "century" is still present - confirm
// how normalizedNumberStringQuiet treats the remainder.
year = QuantifiableEntityNormalizer.normalizedNumberStringQuiet(inputDate.substring(0, inputDate.length() - 2), 1, "", null);
if (year.contains(".")) {//number format issue
year = year.substring(0, year.indexOf('.'));
}
while (year.length() < 4) {
year = year + '*';
}
} else if (QuantifiableEntityNormalizer.ordinalsToValues.containsKey(inputDate)) {
// NOTE(review): Double.toString keeps a decimal part (e.g. "19.0"), and unlike the
// digit branch above it is not stripped here - confirm whether that is intended.
year = Double.toString(QuantifiableEntityNormalizer.ordinalsToValues.getCount(inputDate));
while (year.length() < 4) {
year = year + '*';
}
} else {
if (DEBUG) {
System.out.println("ISODateInstance: Couldn't parse probable century: " + inputDate);
}
year = "";
}
}
return year;
}
// One pattern per month, January first; index + 1 is the month number.
private static final Pattern[] extractorArray = {
  Pattern.compile("[Jj]anuary|JANUARY|[Jj]an\\.?|JAN\\.?"),
  Pattern.compile("[Ff]ebruary|FEBRUARY|[Ff]eb\\.?|FEB\\.?"),
  Pattern.compile("[Mm]arch|MARCH|[Mm]ar\\.?|MAR\\.?"),
  Pattern.compile("[Aa]pril|APRIL|[Aa]pr\\.?|APR\\.?"),
  Pattern.compile("[Mm]ay|MAY"),
  Pattern.compile("[Jj]une|JUNE|[Jj]un\\.?|JUN\\.?"),
  Pattern.compile("[Jj]uly|JULY|[Jj]ul\\.?|JUL\\.?"),
  Pattern.compile("[Aa]ugust|AUGUST|[Aa]ug\\.?|AUG\\.?"),
  Pattern.compile("[Ss]eptember|SEPTEMBER|[Ss]ept?\\.?|SEPT?\\.?"),
  Pattern.compile("[Oo]ctober|OCTOBER|[Oo]ct\\.?|OCT\\.?"),
  Pattern.compile("[Nn]ovember|NOVEMBER|[Nn]ov\\.?|NOV\\.?"),
  Pattern.compile("[Dd]ecember|DECEMBER|[Dd]ec(?:\\.|[^aeiou]|$)|DEC(?:\\.|[^aeiou]|$)") // avoid matching "decades"!
};

/**
 * Looks for a month name or abbreviation in {@code inputDate} and appends the
 * two-digit number of the first month (in calendar order) that matches to the
 * date string. If the date string does not currently hold exactly a
 * four-character year, it is first reset to the wildcard year "****".
 *
 * @return true if a month was found
 */
public boolean extractMonth(String inputDate) {
  for (int i = 0; i < 12; i++) {
    Matcher m = extractorArray[i].matcher(inputDate);
    if (!m.find()) {
      continue;
    }
    String extract = m.group(0);
    if ("".equals(extract)) {
      continue;
    }
    if (DEBUG) {
      System.err.println("month extracted: " + extract);
    }
    if (isoDate.length() != 4) {
      isoDate = "****";
    }
    int monthNum = i + 1;
    isoDate += (monthNum < 10) ? "0" + monthNum : String.valueOf(monthNum);
    return true;
  }
  return false;
}
/**
 * Scans the stored tokens for the first one that denotes a day of the month
 * (1-31), either as digits or as a number/ordinal word known to
 * QuantifiableEntityNormalizer, and appends it as two digits to the date
 * string. Missing year/month fields are filled with wildcards first.
 *
 * @param inputDate not read; the tokens from the last tokenization are used
 * @return true if a day was appended
 */
public boolean extractDay(String inputDate) {
  for (String token : tokens) {
    String extract = token;
    if (QuantifiableEntityNormalizer.wordsToValues.containsKey(extract)) {
      extract = Integer.toString(Double.valueOf(QuantifiableEntityNormalizer.wordsToValues.getCount(extract)).intValue());
    } else if (QuantifiableEntityNormalizer.ordinalsToValues.containsKey(extract)) {
      extract = Integer.toString(Double.valueOf(QuantifiableEntityNormalizer.ordinalsToValues.getCount(extract)).intValue());
    }
    String digits = extract.replaceAll("[^0-9]", "");
    if (digits.equals("")) {
      continue;
    }
    try {
      Integer i = Integer.valueOf(digits);
      int candidate = i.intValue();
      if (candidate >= 32 || candidate <= 0) {
        continue;
      }
      if (isoDate.length() < 6) {//should already have year and month
        // Pad with wildcards: month only when a year is present, year and month otherwise.
        String padding = (isoDate.length() != 4) ? "******" : "**";
        isoDate = isoDate + padding;
      }
      String day = (i < 10) ? "0" + i : String.valueOf(i);
      isoDate = isoDate + day;
      return true;
    } catch (NumberFormatException e) {
      System.err.println("Exception in extract Day.");
      System.err.println("tokens size :" + tokens.size());
      e.printStackTrace();
    }
  }
  return false;
}
private static Pattern[] weekdayArray = {
  Pattern.compile("[Ss]unday"),
  Pattern.compile("[Mm]onday"),
  Pattern.compile("[Tt]uesday"),
  Pattern.compile("[Ww]ednesday"),
  Pattern.compile("[Tt]hursday"),
  Pattern.compile("[Ff]riday"),
  Pattern.compile("[Ss]aturday")
};

/**
 * Last-resort extractor: searches the input for a weekday name and, if one is
 * present, stores it in lower case as the date string. Weekdays are tried
 * from Sunday to Saturday and the first hit wins.
 *
 * @return true if a weekday name was found
 */
public boolean extractWeekday(String inputDate) {
  for (int i = 0; i < weekdayArray.length; i++) {
    Matcher m = weekdayArray[i].matcher(inputDate);
    if (!m.find()) {
      continue;
    }
    isoDate = m.group(0).toLowerCase();
    return true;
  }
  return false;
}
/**
 * For testing only: parses the value of the {@code date} property given on the
 * command line and prints the normalized form. Does nothing when the property
 * is absent.
 */
public static void main(String[] args) {
  String dateProperty = StringUtils.argsToProperties(args).getProperty("date");
  if (dateProperty == null) {
    return;
  }
  ISODateInstance d = new ISODateInstance(dateProperty);
  System.out.println(dateProperty + " processed as " + d.toString());
}
}