// Package edu.stanford.nlp.ie.pascal
//
// Source code of edu.stanford.nlp.ie.pascal.ISODateInstance

package edu.stanford.nlp.ie.pascal;

import edu.stanford.nlp.ie.QuantifiableEntityNormalizer;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;

import java.io.BufferedReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Represents dates and times according to the ISO 8601 standard while also allowing for
* wildcards - e.g., it can represent "21 June" without a year.
* (Standard ISO 8601 only allows omitting the less precise fields, e.g.,
* 200706 rather than 20070621; it has no way to represent 0621 without a year.)
* <p/>
* Format stores date and time separately since the majority of current use
* cases involve only one of these items.  Standard ISO 8601 instead
* requires &lt;date&gt;T&lt;time&gt;.
* <p/>
* Ranges are specified within the strings via forward slash.  For example
* 6 June - 8 June is represented ****0606/****0608.  6 June onward is
* ****0606/ and until 8 June is /****0608.
*
* @author Anna Rafferty
*         TODO: add time support - currently just dates are supported
*/
public class ISODateInstance {

  /** When true, the parsing methods print diagnostic messages. */
  private static final boolean DEBUG = false;
  // Filled by tokenizeDate() from the input string.
  private ArrayList<String> tokens = new ArrayList<String>();//each token contains some piece of the date, from our input.

  /** Range marker: the two-argument String constructor appends '/' to the date (open-ended range starting at the date). */
  public static final String OPEN_RANGE_AFTER = "A";
  /** Range marker: the two-argument String constructor prepends '/' to the date (open-ended range ending at the date). */
  public static final String OPEN_RANGE_BEFORE = "B";
  /** Range marker for a bounded range. NOTE(review): no visible code branches on this value - confirm its use with callers. */
  public static final String BOUNDED_RANGE = "C";
  /** Range marker meaning "no range": the two-argument String constructor leaves the parsed date untouched. */
  public static final String NO_RANGE = "";
  /** Day appended by addExtraRanges() as the boundary between the first and second half of a month. */
  public static final int DAY_OF_HALF_MONTH = 15;
  /** Day appended by addExtraRanges() as the end of the second half of a month. */
  public static final int LAST_DAY_OF_MONTH = 31;//close enough for our purposes
  /** Month appended by addExtraRanges() as the boundary between the first and second half of a year. */
  public static final String MONTH_OF_HALF_YEAR = "07";
  /** Month appended by addExtraRanges() as the end of the second half of a year. */
  public static final String LAST_MONTH_OF_YEAR = "12";
  /**
   * String of the format &lt;year&gt;&lt;month&gt;&lt;day&gt;.  Representations
   * by week are also allowed. If a more general field (such as year)
   * is not specified when a less general one (such as month) is, the characters
   * normally filled by the more general field are replaced by asterisks. For example,
   * 21 June would be "****0621".  Less general fields are simply truncated;
   * for example, June 2007 would be "200706".  Ranges are stored in the same
   * string as &lt;start&gt;/&lt;end&gt;.
   */
  private String isoDate = "";

  // Set when the input string could not be parsed; extractFields() then stores the
  // raw input in isoDate (unless a weekday was recognized).
  private boolean unparseable = false;

  //private String isoTime = "";


  /**
   * Creates an empty date instance; you probably
   * don't want this in most cases.
   */
  public ISODateInstance() {

  }

  /**
   * Takes a string that represents a date, and attempts to
   * normalize it into ISO 8601-compatible format.
   *
   */
  public ISODateInstance(String date) {
    extractFields(date);
  }

  public ISODateInstance(String date, String openRangeMarker) {
    extractFields(date);
    //now process the range marker; if a range was found independently, we ignore the marker
    if ( ! ISODateInstance.NO_RANGE.equals(openRangeMarker) && ! isoDate.contains("/")) {
      if (ISODateInstance.OPEN_RANGE_AFTER.equals(openRangeMarker)) {
        isoDate = isoDate + '/';
      } else if (ISODateInstance.OPEN_RANGE_BEFORE.equals(openRangeMarker)) {
        isoDate = '/' + isoDate;
      }
    }
  }

  /**
   * Constructor for a range of dates, beginning at date start and finishing at date end
   *
   */
  public ISODateInstance(ISODateInstance start, ISODateInstance end) {
    String startString = start.getDateString();
    if (start.isRange()) {
      startString = start.getStartDate();
    }
    String endString = end.getDateString();
    if (end.isRange()) {
      endString = end.getEndDate();
    }

    isoDate = startString + '/' + endString;
    unparseable = (start.isUnparseable() || end.isUnparseable());
  }

  /**
   * Construct a new ISODate based on its relation to a referenceDate.
   * relativeDate should be something like "today" or "tomorrow" or "last year"
   * and the resulting ISODate will be the same as the referenceDate, a day later,
   * or a year earlier, respectively.
   *
   */
  public ISODateInstance(ISODateInstance referenceDate, String relativeDate) {
    Pair<DateField, Integer> relation = relativeDateMap.get(relativeDate.toLowerCase());
    if (relation != null) {
      switch (relation.first()) {
        case DAY:
          incrementDay(referenceDate, relation);
          break;
        case MONTH:
          incrementMonth(referenceDate, relation);
          break;
        case YEAR:
          incrementYear(referenceDate, relation);
          break;
      }
    }
  }


  private void incrementYear(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
    String origDateString = referenceDate.getStartDate();
    String yearString = origDateString.substring(0, 4);
    if (yearString.contains("*")) {
      isoDate = origDateString;
      return;
    }
    isoDate = makeStringYearChange(origDateString, Integer.parseInt(yearString) + relation.second());
  }

  private void incrementMonth(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
    String origDateString = referenceDate.getStartDate();
    String monthString = origDateString.substring(4, 6);
    if (monthString.contains("*")) {
      isoDate = origDateString;
      return;
    }
    //Month is not a variable
    Integer monthNum = Integer.parseInt(monthString);
    //Check if we're an edge case
    if (((monthNum + relation.second()) > 12) || ((monthNum + relation.second) < 1)) {
      boolean decreasing = ((monthNum + relation.second) < 1);
      int newMonthNum = (monthNum + relation.second()) % 12;
      if (newMonthNum < 0) {
        newMonthNum *= -1;
      }
      //Set the month appropriately
      isoDate = makeStringMonthChange(origDateString, newMonthNum);
      //Increment the year if possible
      String yearString = origDateString.substring(0, 4);
      if (!yearString.contains("*")) {
        //How much we increment depends on above mod
        int numYearsToIncrement = (int) Math.ceil(relation.second() / 12.0);
        if (decreasing) {
          isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) - numYearsToIncrement);
        } else {
          isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) + numYearsToIncrement);
        }
      }
    } else {
      isoDate = makeStringMonthChange(origDateString, (monthNum + relation.second()));
    }
  }


  private void incrementDay(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
    String origDateString = referenceDate.getStartDate();
    String dayString = origDateString.substring(origDateString.length() - 2, origDateString.length());
    if (dayString.contains("*")) {
      isoDate = origDateString;
      return;
    }
    //Date is not a variable
    Integer dayNum = Integer.parseInt(dayString);
    String monthString = origDateString.substring(origDateString.length() - 4, origDateString.length() - 2);
    int numDaysInMonth = 30;//default - assume this if month is a variable
    int monthNum = -1;//ie, we don't know the month yet - this remains -1 if the month is a variable
    if (!monthString.contains("*")) {
      //Set appropriate numDaysInMonth and monthNum
      monthNum = Integer.parseInt(monthString);
      numDaysInMonth = daysPerMonth.get(monthNum);
    }

    //Now, find out if we're an edge case (potential to increment month)
    if (dayNum + relation.second() <= numDaysInMonth && dayNum + relation.second() >= 1) {
      //Not an edge case - just increment the day, create a new string, and return
      dayNum += relation.second();
      isoDate = makeStringDayChange(origDateString, dayNum);
      return;
    }

    //Since we're an edge case, the month can't be a variable - if it is a variable, just set this to the reference string
    if (monthNum == -1) {
      isoDate = origDateString;
      return;
    }
    //At this point, neither our day nor our month is a variable
    isoDate = origDateString;
    boolean decreasing = (dayNum + relation.second() < 1);
    //Need to increment the month, set the date appropriately - we need the new month num to set the day appropriately, so do month first
    int newMonthNum;
    //Now, check if we're an edge case for month
    if ((monthNum + 1 > 12 && !decreasing) || (monthNum - 1 < 1 && decreasing)) {
      //First, change the month
      if (decreasing) {
        newMonthNum = 12;
      } else {
        newMonthNum = 1;
      }
      //If we can, increment the year
      //TODO: fix this to work more nicely with variables and thus handle more cases
      String yearString = origDateString.substring(0, 4);
      if (!yearString.contains("*")) {
        if (decreasing) {
          isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) - 1);
        } else {
          isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) + 1);
        }
      }
    } else {
      //We're not an edge case for month - just increment
      if (decreasing) {
        newMonthNum = monthNum - 1;
      } else {
        newMonthNum = monthNum + 1;
      }
    }
    //do the increment
    isoDate = makeStringMonthChange(isoDate, newMonthNum);
    int newDateNum;
    if (decreasing) {
      newDateNum = -relation.second() + daysPerMonth.get(newMonthNum) - dayNum;
    } else {
      newDateNum = relation.second() - dayNum + daysPerMonth.get(monthNum);
    }
    //Now, change the day in our original string to be appropriate
    isoDate = makeStringDayChange(isoDate, newDateNum);


  }

  /**
   * Changes the day portion of the origDate String to be the String
   * value of newDay in two character format. (e.g., 9 -> "09")
   *
   */
  private static String makeStringDayChange(String origDate, int newDay) {
    String newDayString = (newDay < 10 ? ("0" + newDay) : String.valueOf(newDay));
    return origDate.substring(0, origDate.length() - 2) + newDayString;
  }

  /**
   * Changes the month portion of the origDate String to be the String
   * value of newDay in two character format. (e.g., 9 -> "09")
   *
   */
  private static String makeStringMonthChange(String origDate, int newMonth) {
    String newMonthString = (newMonth < 10 ? ("0" + newMonth) : String.valueOf(newMonth));
    return origDate.substring(0, 4) + newMonthString + origDate.substring(6, 8);
  }

  /**
   * Changes the year portion of the origDate String to be the String
   * value of newDay in two character format. (e.g., 9 -> "09")
   *
   */
  private static String makeStringYearChange(String origDate, int newYear) {
    String newYearString = String.valueOf(newYear);
    while (newYearString.length() < 4) {
      newYearString = '0' + newYearString;//we're compatible with year 1!
    }
    return newYearString + origDate.substring(4, origDate.length());
  }


  /**
   * Enum for the fields *
   */
  public static enum DateField {
    DAY, MONTH, YEAR
  }


  /**
   * Map for mapping a relativeDate String to a pair with the field that should be modified and the amount to modify it *
   */
  public static final Map<String, Pair<DateField, Integer>> relativeDateMap = Generics.newHashMap();

  static {
    //Add entries to the relative datemap
    relativeDateMap.put("today", new Pair<DateField, Integer>(DateField.DAY, 0));
    relativeDateMap.put("tomorrow", new Pair<DateField, Integer>(DateField.DAY, 1));
    relativeDateMap.put("yesterday", new Pair<DateField, Integer>(DateField.DAY, -1));


  }

  public static final Map<Integer, Integer> daysPerMonth = Generics.newHashMap();

  static {
    //Add month entries
    daysPerMonth.put(1, 31);
    daysPerMonth.put(2, 28);
    daysPerMonth.put(3, 31);
    daysPerMonth.put(4, 30);
    daysPerMonth.put(5, 31);
    daysPerMonth.put(6, 30);
    daysPerMonth.put(7, 31);
    daysPerMonth.put(8, 31);
    daysPerMonth.put(9, 30);
    daysPerMonth.put(10, 31);
    daysPerMonth.put(11, 30);
    daysPerMonth.put(12, 31);
  }

  /**
   * Takes a string already formatted in ISODateInstance format
   * (such as one previously written out using toString) and creates
   * a new date instance from it
   *
   */
  public static ISODateInstance fromDateString(String date) {
    ISODateInstance d = new ISODateInstance();
    d.isoDate = date;
    return d;
  }

  public String toString() {
    return isoDate;
  }

  /**
   * Provided for backwards compatibility with DateInstance;
   * returns the same thing as toString()
   *
   */
  public String getDateString() {
    return this.toString();
  }

  /**
   * Uses regexp matching to match  month, day, and year fields
   * TODO: Find a way to mark what;s already been handled in the string
   */
  public boolean extractFields(String inputDate) {

    if (tokens.size() < 2) {
      tokenizeDate(inputDate);
    }
    if (DEBUG) {
      System.err.println("Extracting date: " + inputDate);
    }
    //first we see if it's a hyphen and two parseable dates - if not, we treat it as one date
    Pair<String, String> dateEndpoints = getRangeDates(inputDate);
    if (dateEndpoints != null) {
      ISODateInstance date1 = new ISODateInstance(dateEndpoints.first());
      if (dateEndpoints.first().contains(" ") && !dateEndpoints.second().contains(" ")) {
        //consider whether it's a leading modifier; e.g., "June 8-10" will be split into June 8, and 10 when really we'd like June 8 and June 10
        String date = dateEndpoints.first().substring(0, dateEndpoints.first().indexOf(' ')) + ' ' + dateEndpoints.second();
        ISODateInstance date2 = new ISODateInstance(date);
        if (!date1.isUnparseable() && !date2.isUnparseable()) {
          isoDate = (new ISODateInstance(date1, date2)).getDateString();
          return true;
        }
      }

      ISODateInstance date2 = new ISODateInstance(dateEndpoints.second());
      if (!date1.isUnparseable() && !date2.isUnparseable()) {
        isoDate = (new ISODateInstance(date1, date2)).getDateString();
        return true;
      }
    }

    if (extractYYYYMMDD(inputDate)) {
      return true;
    }
    if (extractMMDDYY(inputDate)) {
      return true;
    }
    boolean passed = false;
    passed = extractYear(inputDate) || passed;
    passed = extractMonth(inputDate) || passed;
    passed = extractDay(inputDate) || passed;

    //slightly hacky, but check for some common modifiers that get grouped into the date
    passed = addExtraRanges(inputDate) || passed;

    if (!passed) {//couldn't parse
      //try one more trick
      unparseable = true;
      boolean weekday = extractWeekday(inputDate);
      if (!weekday) {
        isoDate = inputDate;
      }
    }
    return passed;
  }

  private static String[] rangeIndicators = {"--", "-"};

  /**
   * Attempts to find the two sides of a range in the given string.
   * Uses rangeIndicators to find possible matches.
   *
   */
  private static Pair<String, String> getRangeDates(String inputDate) {
    for (String curIndicator : rangeIndicators) {
      String[] dates = inputDate.split(curIndicator);
      if (dates.length == 2) {
        return new Pair<String, String>(dates[0], dates[1]);
      }
    }
    return null;
  }

  private boolean addExtraRanges(String inputDate) {
    if (isRange()) {
      return false;
    }
    inputDate = inputDate.toLowerCase();
    if (inputDate.contains("half")) {
      if (inputDate.contains("first") && isoDate.length() <= 6) {
        String firstDate = isoDate + "01";
        String secondDate;
        if (isoDate.length() == 4) {//year
          secondDate = isoDate + MONTH_OF_HALF_YEAR;
        } else {//month
          secondDate = isoDate + DAY_OF_HALF_MONTH;
        }
        isoDate = firstDate + '/' + secondDate;
        return true;
      } else if (inputDate.contains("second") && isoDate.length() <= 6) {
        String firstDate;
        String secondDate;
        if (isoDate.length() == 4) {//year
          firstDate = isoDate + MONTH_OF_HALF_YEAR;
          secondDate = isoDate + LAST_MONTH_OF_YEAR;
          isoDate = firstDate + '/' + secondDate;
        } else {//month
          firstDate = isoDate + DAY_OF_HALF_MONTH;
          secondDate = isoDate + LAST_DAY_OF_MONTH;
        }
        isoDate = firstDate + '/' + secondDate;
        return true;
      }
    }

    return false;
  }

  /**
   * Returns true iff this date represents a range
   * The range must have at least a start or end
   * date, but is not guaranteed to have both
   *
   * @return Whether this date represents a range
   */
  public boolean isRange() {
    if (unparseable) {
      return false;
    }
    return isoDate.matches("/");
  }

  /**
   * Returns true iff we were unable to parse the input
   * String associated with this date; in that case,
   * we just store the input string and shortcircuit
   * all of the comparison methods
   *
   */
  public boolean isUnparseable() {
    return unparseable;
  }


  /**
   * Returns this date or if it is a range,
   * the date the range starts.  If the date
   * is of the form /&lt;date&gt;, "" is returned
   *
   * @return Start date of range
   */
  public String getStartDate() {
    if (!isRange()) {
      return isoDate;
    }
    if (isoDate.startsWith("/")) {
      return "";
    }
    return isoDate.split("/")[0];
  }

  /**
   * Returns this date or if it is a range,
   * the date the range ends.  If the date
   * is of the form &lt;date&gt;/, "" is returned
   *
   * @return End date of range
   */
  public String getEndDate() {
    if (!isRange()) {
      return isoDate;
    }
    if (isoDate.endsWith("/")) {
      return "";
    }
    String[] split = isoDate.split("/");
    return split[split.length - 1];
  }

  /* -------------------------- Static Comparison Methods -------------------------- */
  /**
   * Returns true if date1 is after date2
   * <p/>
   * Several tricky cases exist, and implementation tries to
   * go with the common sense interpretation:
   * When a year and a month are given for one, but only a month
   * for the other, it is assumed that both have the same year
   * e.g:
   * ****12 is after 200211
   * <p/>
   * When a year and a month are given for one but only a year
   * for the other, it is assumed that one of these is after the
   * other only if the years differ, e.g.:
   * 2003 is after 200211
   * 2002 is not after 200211
   * 200211 is not after 2002
   *
   * @return Whether date2 is after date1
   */
  static boolean isAfter(String date1, String date2) {
    if (!isDateFormat(date1) || !isDateFormat(date2)) {
      return false;
    }
    boolean after = true;
    //first check years
    String year = date1.substring(0, 4);
    String yearOther = date2.substring(0, 4);
    if (year.contains("*") || yearOther.contains("*")) {
      after = after && checkWildcardCompatibility(year, yearOther);
    } else if (Integer.parseInt(year) > Integer.parseInt(yearOther)) {
      return true;
    } else if (Integer.parseInt(year) < Integer.parseInt(yearOther)) {
      return false;
    }

    if (date1.length() < 6 || date2.length() < 6) {
      if (year.contains("*") || yearOther.contains("*")) {
        return after;
      } else {
        return after && (Integer.parseInt(year) != Integer.parseInt(yearOther));
      }
    }
    //then check months
    String month = date1.substring(4, 6);
    String monthOther = date2.substring(4, 6);
    if (month.contains("*") || monthOther.contains("*")) {
      after = after && checkWildcardCompatibility(month, monthOther);
    } else if (Integer.parseInt(month) > Integer.parseInt(monthOther)) {
      return true;
    } else if (Integer.parseInt(month) < Integer.parseInt(monthOther)) {
      return false;
    }

    if (date1.length() < 8 || date2.length() < 8) {
      if (month.contains("*") || monthOther.contains("*")) {
        return after;
      } else {
        return after && (Integer.parseInt(month) != Integer.parseInt(monthOther));
      }
    }

    //then check days
    String day = date1.substring(6, 8);
    String dayOther = date2.substring(6, 8);
    if (day.contains("*") || dayOther.contains("*")) {
      after = after && checkWildcardCompatibility(day, dayOther);
    } else if (Integer.parseInt(day) > Integer.parseInt(dayOther)) {
      return true;
    } else if (Integer.parseInt(day) <= Integer.parseInt(dayOther)) {
      return false;
    }

    return after;
  }

  /**
   * Right now, we say they're compatible iff one of them is all
   * wildcards or they are equivalent
   *
   */
  @SuppressWarnings("unused")
  private static boolean checkWildcardAfterCompatibility(String txt1, String txt2) {
    if (txt1.length() != txt2.length()) {
      return false;
    }

    for (int i = 0; i < txt1.length(); i++) {
      Character t1 = txt1.charAt(i);
      Character t2 = txt2.charAt(i);
      if (!(t1.equals('*') || t2.equals('*') || t1.equals(t2))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true if the given txt contains only digits and "*" characters;
   * false otherwise
   *
   */
  private static boolean isDateFormat(String txt) {
    String numberValue = txt.replace("*", "");//remove wildcards
    try {
      Integer.parseInt(numberValue);
      return true;
    } catch (Exception e) {
      return false;
    }
  }

  /**
   * Returns true iff date1 could represent the same value as date2
   * e.g.
   * ****07 is compatible with 200207 (and 200207 is compatible with ****07)
   * 200207 is compatible with 20020714 (?maybe need a better idea of use case here...)
   *
   */
  public static boolean isCompatible(String date1, String date2) {
    boolean compatible = true;
    //first check years
    compatible = compatible && isYearCompatible(date1, date2);

    //then check months
    compatible = compatible && isMonthCompatible(date1, date2);

    //then check days
    compatible = compatible && isDayCompatible(date1, date2);

    return compatible;

  }

  /**
   * Checks if the years represented by the two dates are compatible
   * If either lacks a year, we return true.
   *
   */
  private static boolean isYearCompatible(String date1, String date2) {
    boolean compatible = true;
    if (date1.length() < 4 || date2.length() < 4) {
      return compatible;
    }
    //first check years
    String year = date1.substring(0, 4);
    String yearOther = date2.substring(0, 4);
    if (year.contains("*") || yearOther.contains("*")) {
      compatible = compatible && checkWildcardCompatibility(year, yearOther);
    } else if (!year.equals(yearOther)) {
      return false;
    }
    return compatible;
  }

  /**
   * Checks if the months represented by the two dates are compatible
   * If either lacks a month, we return true.
   *
   */
  private static boolean isMonthCompatible(String date1, String date2) {
    boolean compatible = true;
    if (date1.length() < 6 || date2.length() < 6) {
      return compatible;
    }
    //then check months
    String month = date1.substring(4, 6);
    String monthOther = date2.substring(4, 6);
    if (month.contains("*") || monthOther.contains("*")) {
      compatible = (compatible && checkWildcardCompatibility(month, monthOther));
    } else if (!month.equals(monthOther)) {
      return false;
    }
    return compatible;
  }

  /**
   * Checks if the days represented by the two dates are compatible
   * If either lacks a day, we return true.
   *
   */
  private static boolean isDayCompatible(String date1, String date2) {
    boolean compatible = true;
    if (date1.length() < 8 || date2.length() < 8) {
      return compatible;
    }
    //then check days
    String day = date1.substring(6, 8);
    String dayOther = date2.substring(6, 8);
    if (day.contains("*") || dayOther.contains("*")) {
      compatible = compatible && checkWildcardCompatibility(day, dayOther);
    } else if (!day.equals(dayOther)) {
      return false;
    }
    return compatible;
  }


  /**
   */
  private static boolean checkWildcardCompatibility(String txt1, String txt2) {
    if (txt1.length() != txt2.length()) {
      return false;
    }
    for (int i = 0; i < txt1.length(); i++) {
      Character t1 = txt1.charAt(i);
      Character t2 = txt2.charAt(i);
      if (!(t1.equals('*') || t2.equals('*') || t1.equals(t2))) {
        return false;
      }
    }
    return true;
  }


  /* -------------------------- Instance Comparison Methods -------------------------- */
  /**
   * Returns true iff this date
   * contains the date represented by other.
   * A range contains a date if it
   * is equal to or after the start date and equal to or
   * before the end date.  For open ranges, contains
   * is also inclusive of the one end point.
   *
   */
  public boolean contains(ISODateInstance other) {
    if (this.isUnparseable() || other.isUnparseable()) {
      return this.isoDate.equals(other.isoDate);
    }
    String start = this.getStartDate();
    if (!start.equals("")) {//we have a start date, need to make sure other is after it
      String startOther = other.getStartDate();
      if (startOther.equals("")) {
        return false;//incompatible
      } else {
        if (!isAfter(startOther, start)) {
          return false;
        }
      }
    }
    //now we've found out that the start date is appropriate, check the end date
    String end = this.getEndDate();
    if (!end.equals("")) {
      String endOther = other.getEndDate();
      if (endOther.equals("")) {
        return false;
      } else {
        if (!isAfter(end, endOther)) {
          return false;
        }
      }
    }
    return true;//passes both start and end
  }


  /**
   * Returns true if this date instance is after
   * the given dateString.  If this date instance
   * is a range, then returns true only if both
   * start and end dates are after dateString.
   * <p/>
   * Several tricky cases exist, and implementation tries to
   * go with the commonsense interpretation:
   * When a year and a month are given for one, but only a month
   * for the other, it is assumed that both have the same year
   * e.g:
   * ****12 is after 200211
   * <p/>
   * When a year and a month are given for one but only a year
   * for the other, it is assumed that one of these is after the
   * other only if the years differ, e.g.:
   * 2003 is after 200211
   * 2002 is not after 200211
   * 200211 is not after 2002
   *
   */
  public boolean isAfter(String dateString) {
    if (this.isUnparseable()) {
      return false;
    }
    if (!isDateFormat(dateString)) {
      return false;
    }
    return isAfter(this.getEndDate(), dateString);
  }

  public boolean isCompatibleDate(ISODateInstance other) {
    if (this.isUnparseable() || other.isUnparseable()) {
      return this.isoDate.equals(other.isoDate);
    }

    //first see if either is a range
    if (this.isRange()) {
      return this.contains(other);
    } else if (other.isRange()) {
      return false;//not compatible if other is range and this isn't
    } else {
      return isCompatible(isoDate, other.getDateString());
    }
  }

  /**
   * Looks if the years for the two dates are compatible.
   * This method does not consider ranges and uses only the
   * start date.
   *
   */
  public boolean isYearCompatible(ISODateInstance other) {
    if (this.isUnparseable() || other.isUnparseable()) {
      return this.isoDate.equals(other.isoDate);
    }

    return isYearCompatible(isoDate, other.getDateString());
  }

  /**
   * Looks if the months for the two dates are compatible.
   * This method does not consider ranges and uses only the
   * start date.
   *
   */
  public boolean isMonthCompatible(ISODateInstance other) {
    if (this.isUnparseable() || other.isUnparseable()) {
      return this.isoDate.equals(other.isoDate);
    }

    return isMonthCompatible(isoDate, other.getDateString());
  }

  /**
   * Looks if the days for the two dates are compatible.
   * This method does not consider ranges and uses only the
   * start date.
   *
   */
  public boolean isDayCompatible(ISODateInstance other) {
    if (this.isUnparseable() || other.isUnparseable()) {
      return this.isoDate.equals(other.isoDate);
    }

    return isDayCompatible(isoDate, other.getDateString());
  }


  /* -------------------------- Tokenization and Field Extraction -------------------------- */
  //These methods are taken directly from or modified slightly from {@link DateInstance}

  private void tokenizeDate(String inputDate) {
    tokens = new ArrayList<String>();
    Pattern pat = Pattern.compile("[-]");
    if (inputDate == null) {
      System.out.println("Null input date");
    }
    Matcher m = pat.matcher(inputDate);
    String str = m.replaceAll(" - ");
    str = str.replaceAll(",", " ");
    PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(str)));
    while (tokenizer.hasNext()) {
      Word nextToken = tokenizer.next();
      tokens.add(nextToken.toString());
    }
    if(DEBUG) {
      System.out.println("tokens:" + tokens);
    }
  }


  /**
   * This method does YYYY-MM-DD style ISO date formats
   *
   * @return whether it worked.
   */
  private boolean extractYYYYMMDD(String inputDate) {
    Pattern pat = Pattern.compile("([12][0-9]{3})[ /-]?([01]?[0-9])[ /-]([0-3]?[0-9])[ \t\r\n\f]*");
    Matcher m = pat.matcher(inputDate);
    if (m.matches()) {
      if (DEBUG) {
        System.err.println("YYYYMMDD succeeded");
      }
      String monthValue = m.group(2);
      if (monthValue.length() < 2)//we always use two digit months
      {
        monthValue = '0' + monthValue;
      }
      String dayValue = m.group(3);
      if (dayValue.length() < 2) {
        dayValue = '0' + dayValue;
      }
      String yearString = m.group(1);
      isoDate = yearString + monthValue + dayValue;
      return true;
    }
    return false;
  }

  /**
   * Note: This method copied from {@code DateInstance}; not sure how we tell that it
   * is MMDD versus DDMM (sometimes it will be ambiguous).
   *
   */
  private boolean extractMMDDYY(String inputDate) {
    Pattern pat = Pattern.compile("([0-1]??[0-9])[ \t\n\r\f]*[/-][ \t\n\r\f]*([0-3]??[0-9])[ \t\r\n\f]*[/-][ \t\r\n\f]*([0-2]??[0-9]??[0-9][0-9])[ \t\r\n\f]*");
    Matcher m = pat.matcher(inputDate);
    if (m.matches()) {
      if (DEBUG) {
        System.err.println("MMDDYY succeeded");
      }
      String monthValue = m.group(1);
      if (monthValue.length() < 2)//we always use two digit months
      {
        monthValue = '0' + monthValue;
      }
      String dayValue = m.group(2);
      if (dayValue.length() < 2) {
        dayValue = '0' + dayValue;
      }
      String yearString; // always initialized below
      if (m.group(3).length() == 2) {
        int yearInt = Integer.parseInt(m.group(3));
        //Now we add "20" or "19" to the front of the two digit year depending on its value....
        if (yearInt < 50) {
          yearString = "20" + m.group(3);
        } else {
          yearString = "19" + m.group(3);
        }

      } else {
        yearString = m.group(3);
      }
      //lastYearSet = new Integer(yearString).intValue();
      isoDate = yearString + monthValue + dayValue;
      return true;
    }
    return false;
  }

  private Pattern re1 = Pattern.compile("[1-2][0-9]{3}|'[0-9]{2}");
  private Pattern re2 = Pattern.compile("[0-9][^0-9].*([0-9]{2})\\s*$");

  public boolean extractYear(String inputDate) {
    if (DEBUG) {
      System.err.println("Extracting year from: |" + inputDate + '|');
    }
    String extract;
    Matcher m1 = re1.matcher(inputDate);
    Matcher m2 = re2.matcher(inputDate);
    if (m1.find()) {
      extract = m1.group(0);
    } else if (m2.find()) {
      extract = m2.group(1);
    } else {
      extract = foundMiscYearPattern(inputDate);
      if (extract == null || extract.equals("")) {
        isoDate = "****";
        return false;
      }
    }

    if ( ! "".equals(extract)) {
      if (extract.charAt(0) == '\'') {
        extract = extract.substring(1);
      }
      extract = extract.trim();
      if (extract.length() == 2) {
        if (extract.charAt(0) < '5') {
          extract = "20" + extract;
        } else {
          extract = "19" + extract;
        }
      }
      if (inputDate.charAt(inputDate.length() - 1) == 's') {//decade or century marker
        if (extract.charAt(2) == '0') {//e.g., 1900s -> 1900/1999
          String endDate = Integer.toString((Integer.parseInt(extract) + 99));
          extract = extract + '/' + endDate;
        } else {//e.g., 1920s -> 1920/1929
          String endDate = Integer.toString((Integer.parseInt(extract) + 9));
          extract = extract + '/' + endDate;
        }
      }
      isoDate = extract;
      if (DEBUG) {
        System.err.println("year extracted:" + extract);
      }
      return true;
    }
    isoDate = "****";
    return false;
  }

  /**
   * Tries to find a year pattern in the input string that may be somewhat
   * odd/non-standard.
   *
   */
  private static String foundMiscYearPattern(String inputDate) {
    String year = "";
    if (inputDate.toLowerCase().contains("century")) {
      if (inputDate.endsWith("A.D. ")) {
        inputDate = inputDate.substring(0, inputDate.length()-5);
        if(DEBUG) {
          System.out.println("inputDate: |" + inputDate + "|");
        }
      }
      if (inputDate.startsWith("late")) {
        inputDate = inputDate.substring(5, inputDate.length());
        if(DEBUG) {
          System.out.println("inputDate: |" + inputDate + "|");
        }
      }
      if (inputDate.startsWith("early")) {
        inputDate = inputDate.substring(6, inputDate.length());
        if(DEBUG) {
          System.out.println("inputDate: |" + inputDate + "|");
        }
      }
      if (Character.isDigit(inputDate.charAt(0))) {
        // just parse number part, assuming last two letters are st/nd/rd
        year = QuantifiableEntityNormalizer.normalizedNumberStringQuiet(inputDate.substring(0, inputDate.length() - 2), 1, "", null);
        if (year.contains(".")) {//number format issue
          year = year.substring(0, year.indexOf('.'));
        }
        while (year.length() < 4) {
          year = year + '*';
        }
      } else if (QuantifiableEntityNormalizer.ordinalsToValues.containsKey(inputDate)) {
        year = Double.toString(QuantifiableEntityNormalizer.ordinalsToValues.getCount(inputDate));
        while (year.length() < 4) {
          year = year + '*';
        }
      } else {
        if (DEBUG) {
          System.out.println("ISODateInstance: Couldn't parse probable century: " + inputDate);
        }
        year = "";
      }
    }
    return year;
  }

  private static final Pattern[] extractorArray = {Pattern.compile("[Jj]anuary|JANUARY|[Jj]an\\.?|JAN\\.?"), Pattern.compile("[Ff]ebruary|FEBRUARY|[Ff]eb\\.?|FEB\\.?"), Pattern.compile("[Mm]arch|MARCH|[Mm]ar\\.?|MAR\\.?"), Pattern.compile("[Aa]pril|APRIL|[Aa]pr\\.?|APR\\.?"), Pattern.compile("[Mm]ay|MAY"), Pattern.compile("[Jj]une|JUNE|[Jj]un\\.?|JUN\\.?"), Pattern.compile("[Jj]uly|JULY|[Jj]ul\\.?|JUL\\.?"), Pattern.compile("[Aa]ugust|AUGUST|[Aa]ug\\.?|AUG\\.?"), Pattern.compile("[Ss]eptember|SEPTEMBER|[Ss]ept?\\.?|SEPT?\\.?"), Pattern.compile("[Oo]ctober|OCTOBER|[Oo]ct\\.?|OCT\\.?"), Pattern.compile("[Nn]ovember|NOVEMBER|[Nn]ov\\.?|NOV\\.?"), Pattern.compile("[Dd]ecember|DECEMBER|[Dd]ec(?:\\.|[^aeiou]|$)|DEC(?:\\.|[^aeiou]|$)")}; // avoid matching "decades"!

  public boolean extractMonth(String inputDate) {
    boolean foundMonth = false;

    for (int i = 0; i < 12; i++) {
      String extract = "";
      Matcher m = extractorArray[i].matcher(inputDate);
      if (m.find()) {
        extract = m.group(0);
      }
      if ( ! "".equals(extract)) {
        if (!foundMonth) {
          if (DEBUG) {
            System.err.println("month extracted: " + extract);
          }
          int monthNum = i + 1;
          if (isoDate.length() != 4) {
            isoDate = "****";
          }
          String month = (monthNum < 10) ? "0" + monthNum : String.valueOf(monthNum);
          isoDate += month;
          foundMonth = true;
        }
      }
    }
    return foundMonth;
  }

  public boolean extractDay(String inputDate) {
    for (int a = 0; a < tokens.size(); a++) {
      String extract = tokens.get(a);
      if (QuantifiableEntityNormalizer.wordsToValues.containsKey(extract)) {
        extract = Integer.toString(Double.valueOf(QuantifiableEntityNormalizer.wordsToValues.getCount(extract)).intValue());
      } else if (QuantifiableEntityNormalizer.ordinalsToValues.containsKey(extract)) {
        extract = Integer.toString(Double.valueOf(QuantifiableEntityNormalizer.ordinalsToValues.getCount(extract)).intValue());
      }
      extract = extract.replaceAll("[^0-9]", "");
      if (!extract.equals("")) {
        try {
          Integer i = Integer.valueOf(extract);
          if (i.intValue() < 32 && i.intValue() > 0) {
            if (isoDate.length() < 6) {//should already have year and month
              if (isoDate.length() != 4)//throw new RuntimeException("Error extracting dates; should have had month and year but didn't");
              {
                isoDate = isoDate + "******";
              } else {
                isoDate = isoDate + "**";
              }
            }
            String day = (i < 10) ? "0" + i : String.valueOf(i);
            isoDate = isoDate + day;
            return true;
          }
        } catch (NumberFormatException e) {
          System.err.println("Exception in extract Day.");
          System.err.println("tokens size :" + tokens.size());
          e.printStackTrace();
        }
      }
    }
    return false;
  }

  private static Pattern[] weekdayArray = {Pattern.compile("[Ss]unday"), Pattern.compile("[Mm]onday"), Pattern.compile("[Tt]uesday"), Pattern.compile("[Ww]ednesday"), Pattern.compile("[Tt]hursday"), Pattern.compile("[Ff]riday"), Pattern.compile("[Ss]aturday")};

  /**
   * This is a backup method if everything else fails.  It searches for named
   * days of the week and if it finds one, it sets that as the date in lowercase form
   *
   */
  public boolean extractWeekday(String inputDate) {
    for (Pattern p : weekdayArray) {
      Matcher m = p.matcher(inputDate);
      if (m.find()) {
        String extract = m.group(0);
        isoDate = extract.toLowerCase();
        return true;
      }
    }
    return false;
  }

  /**
   * For testing only
   *
   */
  public static void main(String[] args) {
    Properties props = StringUtils.argsToProperties(args);
    String dateProperty = props.getProperty("date");
    if (dateProperty != null) {
      ISODateInstance d = new ISODateInstance(dateProperty);
      System.out.println(dateProperty + " processed as " + d.toString());
    }
  }


}
TOP

Related Classes of edu.stanford.nlp.ie.pascal.ISODateInstance

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.