// Source code of edu.stanford.nlp.time.TimeExpressionExtractorImpl

package edu.stanford.nlp.time;


import edu.stanford.nlp.ie.NumberNormalizer;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.*;
import edu.stanford.nlp.pipeline.ChunkAnnotationUtils;
import edu.stanford.nlp.util.CoreMap;


import java.text.SimpleDateFormat;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * Extracts time expressions
 *
 * @author Angel Chang
 */
@SuppressWarnings("unchecked")
public class TimeExpressionExtractorImpl implements TimeExpressionExtractor {


  protected static final Logger logger = Logger.getLogger(TimeExpressionExtractorImpl.class.getName());


  // Patterns for extracting time expressions
  TimeExpressionPatterns timexPatterns;


  CoreMapExpressionExtractor expressionExtractor;


  // Options
  Options options;


  public TimeExpressionExtractorImpl()
  {
    init(new Options());
  }


  public TimeExpressionExtractorImpl(String name, Properties props)
  {
    init(name, props);
  }


  @Override
  public void init(String name, Properties props)
  {
    init(new Options(name, props));
  }


  @Override
  public void init(Options options)
  {
    this.options = options;
    // TODO: does not allow for multiple loggers
    if (options.verbose) {
      logger.setLevel(Level.FINE);
    } else {
      logger.setLevel(Level.SEVERE);
    }
    NumberNormalizer.setVerbose(options.verbose);
    if (options.grammarFilename == null) {
      options.grammarFilename = Options.DEFAULT_GRAMMAR_FILES;
      logger.warning("Time rules file is not specified: using default rules at " + options.grammarFilename);
    }
    timexPatterns = new GenericTimeExpressionPatterns(options);
    this.expressionExtractor = timexPatterns.createExtractor();
    this.expressionExtractor.setLogger(logger);
  }


  @Override
  public List<CoreMap> extractTimeExpressionCoreMaps(CoreMap annotation, CoreMap docAnnotation) {
    SUTime.TimeIndex timeIndex = null;
    String docDate = null;
    if (docAnnotation != null) {
      timeIndex = docAnnotation.get(TimeExpression.TimeIndexAnnotation.class);
      if (timeIndex == null) {
        docAnnotation.set(TimeExpression.TimeIndexAnnotation.class, timeIndex = new SUTime.TimeIndex());
      }
      docDate = docAnnotation.get(CoreAnnotations.DocDateAnnotation.class);
      if(docDate == null){
        Calendar cal = docAnnotation.get(CoreAnnotations.CalendarAnnotation.class);
        if(cal == null){
          logger.log(Level.WARNING, "No document date specified");
        } else {
          SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd:hh:mm:ss");
          docDate = dateFormat.format(cal.getTime());
        }
      }
    } else {
      timeIndex = new SUTime.TimeIndex();
    }
    if ("".equals(docDate)) {
      docDate = null;
    }
    if (timeIndex.docDate == null && docDate != null) {
      try {
        // TODO: have more robust parsing of document date?  docDate may not have century....
        // TODO: if docDate didn't change, we can cache the parsing of the docDate and not repeat it for every sentence
        timeIndex.docDate = SUTime.parseDateTime(docDate,true);
      } catch (Exception e) {
        throw new RuntimeException("Could not parse date string: [" + docDate + "]", e);
      }
    }
    String sectionDate = annotation.get(CoreAnnotations.SectionDateAnnotation.class);
    String refDate = (sectionDate != null)? sectionDate:docDate;
    return extractTimeExpressionCoreMaps(annotation, refDate, timeIndex);
  }


  @Override
  public List<CoreMap> extractTimeExpressionCoreMaps(CoreMap annotation, String docDate)
  {
    SUTime.TimeIndex timeIndex = new SUTime.TimeIndex();
    return extractTimeExpressionCoreMaps(annotation, docDate, timeIndex);
  }


  public List<CoreMap> extractTimeExpressionCoreMaps(CoreMap annotation, String docDate, SUTime.TimeIndex timeIndex)
  {
    List<TimeExpression> timeExpressions = extractTimeExpressions(annotation, docDate, timeIndex);
    return toCoreMaps(annotation, timeExpressions, timeIndex);
  }


  public void finalize(CoreMap docAnnotation) {
    docAnnotation.remove(TimeExpression.TimeIndexAnnotation.class);
  }


  private List<CoreMap> toCoreMaps(CoreMap annotation, List<TimeExpression> timeExpressions, SUTime.TimeIndex timeIndex)
  {
    if (timeExpressions == null) return null;
    List<CoreMap> coreMaps = new ArrayList<CoreMap>(timeExpressions.size());
    for (TimeExpression te:timeExpressions) {
      CoreMap cm = te.getAnnotation();
      SUTime.Temporal temporal = te.getTemporal();
      if (temporal != null) {
        String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
        String text = cm.get(CoreAnnotations.TextAnnotation.class);
        if (origText != null) {
          // Make sure the text is from original (and not from concatenated tokens)
          ChunkAnnotationUtils.annotateChunkText(cm, annotation);
          text = cm.get(CoreAnnotations.TextAnnotation.class);
        }
        Map<String,String> timexAttributes;
        try {
          timexAttributes = temporal.getTimexAttributes(timeIndex);
          if (options.includeRange) {
            SUTime.Temporal rangeTemporal = temporal.getRange();
            if (rangeTemporal != null) {
              timexAttributes.put("range", rangeTemporal.toString());
            }
          }
        } catch (Exception e) {
          logger.log(Level.WARNING, "Failed to get attributes from " + text + ", timeIndex " + timeIndex, e);
          continue;
        }
        Timex timex;
        try {
          timex = Timex.fromMap(text, timexAttributes);
        } catch (Exception e) {
          logger.log(Level.WARNING, "Failed to process timex " + text + " with attributes " + timexAttributes, e);
          continue;
        }
        assert timex != null;  // Timex.fromMap never returns null and if it exceptions, we've already done a continue
        cm.set(TimeAnnotations.TimexAnnotation.class, timex);
        coreMaps.add(cm);
      }
    }
    return coreMaps;
  }


  public List<TimeExpression> extractTimeExpressions(CoreMap annotation, String refDateStr, SUTime.TimeIndex timeIndex) {
    SUTime.Time refDate = null;
    if (refDateStr != null) {
      try {
        // TODO: have more robust parsing of document date?  docDate may not have century....
        // TODO: if docDate didn't change, we can cache the parsing of the docDate and not repeat it for every sentence
        refDate = SUTime.parseDateTime(refDateStr,true);
      } catch (Exception e) {
        throw new RuntimeException("Could not parse date string: [" + refDateStr + "]", e);
      }
    }
    return extractTimeExpressions(annotation, refDate, timeIndex);
  }


  public List<TimeExpression> extractTimeExpressions(CoreMap annotation, SUTime.Time refDate, SUTime.TimeIndex timeIndex)
  {
    if (!annotation.containsKey(CoreAnnotations.NumerizedTokensAnnotation.class)) {
      List<CoreMap> mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation);
      annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers);
    }


    List<? extends MatchedExpression> matchedExpressions = expressionExtractor.extractExpressions(annotation);
    List<TimeExpression> timeExpressions = new ArrayList<TimeExpression>(matchedExpressions.size());
    for (MatchedExpression expr : matchedExpressions) {
      // Make sure we have the correct type (instead of just MatchedExpression)
      //timeExpressions.add(TimeExpression.TimeExpressionConverter.apply(expr));


      // TODO: Fix the extraction pipeline so it creates TimeExpression instead of MatchedExpressions
      // For now, grab the time expression from the annotation (this is good, so we don't have duplicate copies)
      TimeExpression annoTe = expr.getAnnotation().get( TimeExpression.Annotation.class );
      if (annoTe != null) {
        timeExpressions.add(annoTe);
      }
    }
    // We cache the document date in the timeIndex
    if (timeIndex.docDate == null) {
      if (refDate != null) timeIndex.docDate = refDate;
      else if (options.searchForDocDate) {
        // there was no document date but option was set to look for document date
        timeIndex.docDate = findReferenceDate(timeExpressions);
      }
    }
    // Didn't have a reference date - try using cached doc date
    if (refDate == null) refDate = timeIndex.docDate;


    // Some resolving is done even if refDate null...
    if ( timeExpressions != null) {
      resolveTimeExpressions(annotation, timeExpressions, refDate);
    }
    if (options.restrictToTimex3) {
      // Keep only TIMEX3 compatible timeExpressions
      List<TimeExpression> kept = new ArrayList<TimeExpression>(timeExpressions.size());
      for (TimeExpression te:timeExpressions) {
        if (te.getTemporal() != null && te.getTemporal().getTimexValue() != null) {
          kept.add(te);
        } else {
          List<? extends CoreMap> children = te.getAnnotation().get(TimeExpression.ChildrenAnnotation.class);
          if (children != null) {
            for (CoreMap child:children) {
              TimeExpression childTe = child.get(TimeExpression.Annotation.class);
              if (childTe != null) {
                resolveTimeExpression(annotation, childTe, refDate);
                if (childTe.getTemporal() != null && childTe.getTemporal().getTimexValue() != null) {
                  kept.add(childTe);
                }
              }
            }
          }
        }
      }
      timeExpressions = kept;
    }


    // Add back nested time expressions for ranges....
    // For now only one level of nesting...
    if (options.includeNested) {
      List<TimeExpression> nestedTimeExpressions = new ArrayList<TimeExpression>();
      for (TimeExpression te:timeExpressions) {
        if (te.isIncludeNested())  {
          List<? extends CoreMap> children = te.getAnnotation().get(TimeExpression.ChildrenAnnotation.class);
          if (children != null) {
            for (CoreMap child:children) {
              TimeExpression childTe = child.get(TimeExpression.Annotation.class);
              if (childTe != null) {
                nestedTimeExpressions.add(childTe);
              }
            }
          }
        }
      }
      resolveTimeExpressions(annotation, nestedTimeExpressions, refDate);
      timeExpressions.addAll(nestedTimeExpressions);
    }
    Collections.sort(timeExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR);
    // Some resolving is done even if refDate null...
    if (timeExpressions != null) {
      resolveTimeExpressions(annotation, timeExpressions, refDate);
    }
    return timeExpressions;
  }


  private void resolveTimeExpression(CoreMap annotation, TimeExpression te, SUTime.Time docDate)
  {
    SUTime.Temporal temporal = te.getTemporal();
    if (temporal != null) {
      // TODO: use correct time for anchor
      try {
        int flags = timexPatterns.determineRelFlags(annotation, te);
        //int flags = 0;
        SUTime.Temporal grounded = temporal.resolve(docDate, flags);
        if (grounded == null) {
          logger.warning("Error resolving " + temporal + ", using docDate=" + docDate);
        }
        if (grounded != temporal) {
          te.origTemporal = temporal;
          te.setTemporal(grounded);
        }
      } catch (Exception ex) {
        logger.log(Level.WARNING, "Error resolving " + temporal, ex);
      }
    }
  }


  private void resolveTimeExpressions(CoreMap annotation, List<TimeExpression> timeExpressions, SUTime.Time docDate)
  {
    for (TimeExpression te:timeExpressions) {
      resolveTimeExpression(annotation, te, docDate);
    }
  }


  private SUTime.Time findReferenceDate(List<TimeExpression> timeExpressions) {
    // Find first full date in this annotation with year, month, and day
    for (TimeExpression te:timeExpressions) {
      SUTime.Temporal t = te.getTemporal();
      if (t instanceof SUTime.Time) {
        if (t.isGrounded()) {
          return t.getTime();
        } else if (t instanceof SUTime.PartialTime) {
          if (JodaTimeUtils.hasYYYYMMDD(t.getTime().getJodaTimePartial())) {
            return t.getTime();
          } else if (JodaTimeUtils.hasYYMMDD(t.getTime().getJodaTimePartial())) {
            return t.getTime().resolve(SUTime.getCurrentTime()).getTime();
          }
        }
      }
    }
    return null;
  }
}
// Source code of edu.stanford.nlp.time.TimeExpressionExtractorImpl

// Related classes of edu.stanford.nlp.time.TimeExpressionExtractorImpl