Source Code of org.apache.ctakes.ytex.uima.annotators.SentenceDetector

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.ytex.uima.annotators;


import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;


import opennlp.tools.sentdetect.DefaultSDContextGenerator;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.util.InvalidFormatException;


import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.core.sentence.EndOfSentenceScannerImpl;
import org.apache.ctakes.core.sentence.SentenceDetectorCtakes;
import org.apache.ctakes.core.util.ParamUtil;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.resource.ResourceAccessException;
import org.apache.uima.resource.ResourceInitializationException;


import com.google.common.base.Strings;


/**
 * Wraps the OpenNLP sentence detector in a UIMA annotator.
 * 
 * Changes:
 * <ul>
 * <li>split on paragraphs before feeding into maximum entropy model
 * <li>don't split on newlines
 * <li>split on periods
 * <li>split on semi-structured text such as checkboxes
 * </ul>
 * 
 * Parameters (optional):
 * <ul>
 * <li>paragraphPattern: regex to split paragraphs. default PARAGRAPH_PATTERN
 * <li>acronymPattern: default ACRONYM_PATTERN. If the text preceding period
 * matches this pattern, we do not split at the period
 * <li>periodPattern: default PERIOD_PATTERN. If the text following period
 * matches this pattern, we split it.
 * <li>splitPattern: regex to split at semi-structured fields. default
 * SPLIT_PATTERN
 * </ul>
 * 
 * 
 * 
 * @author Mayo Clinic
 * @author vijay
 */
public class SentenceDetector extends JCasAnnotator_ImplBase {
  /**
   * Value is "SegmentsToSkip". This parameter specifies which sections to
   * skip. The parameter should be of type String, should be multi-valued and
   * optional.
   */
  public static final String PARAM_SEGMENTS_TO_SKIP = "SegmentsToSkip";


  // LOG4J logger based on class name
  private Logger logger = Logger.getLogger(getClass().getName());


  /**
   * Name of the configuration parameter that holds the location of the
   * OpenNLP sentence model; configInit() resolves it through FileLocator.
   */
  public static final String SD_MODEL_FILE_PARAM = "SentenceModelFile";


  /** OpenNLP sentence model; loaded by configInit(). */
  private opennlp.tools.sentdetect.SentenceModel sdmodel;
  /**
   * Default paragraph pattern: matches a colon immediately followed by a line
   * break, or two consecutive line breaks (each line feed may be preceded by
   * a carriage return). annotateParagraph() cuts the text at the end of each
   * match before the pieces are passed to the sentence model.
   */
  public static final String PARAGRAPH_PATTERN = "(?m):\\r{0,1}\\n|\\r{0,1}\\n\\r{0,1}\\n";
  /**
   * Default acronym pattern: matches "Dr", "Ms", "Mr", "Mrs" or a single
   * upper-case letter at the very end of the examined text. Per the class
   * documentation, a period is not treated as a sentence end when the text
   * preceding it matches this pattern.
   * NOTE(review): "Ms\z" appears twice in the alternation; the second
   * occurrence looks redundant.
   */
  public static final String ACRONYM_PATTERN = "(?m)Dr\\z|Ms\\z|Mr\\z|Mrs\\z|Ms\\z|\\p{Upper}\\z";
  /**
   * Default period pattern: matches, at the very start of the examined text,
   * whitespace followed by an upper-case letter, or whitespace followed by a
   * digit and a period. Per the class documentation, a sentence is split at a
   * period when the text following it matches this pattern.
   */
  public static final String PERIOD_PATTERN = "(?m)\\A\\s+\\p{Upper}|\\A\\s+\\d\\.";
  /**
   * Default pattern for semi-structured text (case-insensitive, multi-line).
   * It has three alternatives: a line feed followed by a bracketed checkbox
   * holding up to three of the letters in "yesxno"; such a checkbox followed
   * by optional whitespace and a line break; or a complete line of the form
   * "label:value" with 3 to 20 characters on each side of the colon.
   */
  public static final String SPLIT_PATTERN = "(?im)\\n[\\(\\[]\\s*[yesxno]{0,3}\\s*[\\)\\]]|[\\(\\[]\\s*[yesxno]{0,3}\\s*[\\)\\]]\\s*\\r{0,1}\\n|^[^:\\r\\n]{3,20}\\:[^\\r\\n]{3,20}$";
  /**
   * Compiled paragraph pattern (configuration parameter "paragraphPattern",
   * default PARAGRAPH_PATTERN). When null, annotateParagraph() does not split
   * the range into paragraphs.
   */
  private Pattern paragraphPattern;
  /**
   * Compiled split pattern (configuration parameter "splitPattern", default
   * SPLIT_PATTERN); passed to SentenceSpan.splitAtPeriodAndTrim in
   * annotateRange(). May be null.
   */
  private Pattern splitPattern;
  /**
   * Compiled period pattern (configuration parameter "periodPattern", default
   * PERIOD_PATTERN); passed to SentenceSpan.splitAtPeriodAndTrim in
   * annotateRange(). May be null.
   */
  private Pattern periodPattern;
  /**
   * Compiled acronym pattern (configuration parameter "acronymPattern",
   * default ACRONYM_PATTERN); passed to SentenceSpan.splitAtPeriodAndTrim in
   * annotateRange(). May be null.
   */
  private Pattern acronymPattern;


  /** UIMA context stored by initialize(); source of all configuration values. */
  private UimaContext context;


  /**
   * Values of the PARAM_SEGMENTS_TO_SKIP parameter; process() ignores every
   * segment whose id is contained in this set.
   */
  private Set<?> skipSegmentsSet;


  /** Sentence detector built in configInit() from the loaded model. */
  private SentenceDetectorCtakes sentenceDetector;


  /** Line-break string handed to SentenceSpan.splitAtLineBreaksAndTrim. */
  private String NEWLINE = "\n";


  /**
   * Number of Sentence annotations added so far for the document being
   * processed; reset to 0 at the start of process().
   */
  private int sentenceCount = 0;


  public void initialize(UimaContext aContext)
      throws ResourceInitializationException {


    super.initialize(aContext);
    logger.info(Arrays.asList(aContext.getConfigParameterNames()));


    context = aContext;
    try {
      configInit();
    } catch (Exception ace) {
      throw new ResourceInitializationException(ace);
    }
  }


  /**
   * Reads configuration parameters.
   * 
   * @throws ResourceAccessException
   * @throws IOException 
   * @throws InvalidFormatException 
   */
  private void configInit() throws ResourceAccessException, InvalidFormatException, IOException {


    String sdModelPath = (String) context
        .getConfigParameterValue(SD_MODEL_FILE_PARAM);
      InputStream is = FileLocator.getAsStream(sdModelPath);
      logger.info("Sentence detector model file: " + sdModelPath);
      sdmodel = new SentenceModel(is);
      is.close();
      EndOfSentenceScannerImpl eoss = new EndOfSentenceScannerImpl();
      char[] eosc = eoss.getEndOfSentenceCharacters();
      // SentenceDContextGenerator cg = new SentenceDContextGenerator();
      DefaultSDContextGenerator cg = new DefaultSDContextGenerator(eosc);
      sentenceDetector = new SentenceDetectorCtakes(
          sdmodel.getMaxentModel(), cg, eoss);


      skipSegmentsSet = ParamUtil.getStringParameterValuesSet(
          PARAM_SEGMENTS_TO_SKIP, context);
      // vng change begin
      paragraphPattern = compilePatternCheck("paragraphPattern",
          PARAGRAPH_PATTERN);
      splitPattern = compilePatternCheck("splitPattern", SPLIT_PATTERN);
      periodPattern = compilePatternCheck("periodPattern", PERIOD_PATTERN);
      acronymPattern = compilePatternCheck("acronymPattern", ACRONYM_PATTERN);
      // vng change end
  }
  /**
   * vng change
   */
  private Pattern compilePatternCheck(String patternKey, String patternDefault) {
    String strPattern = (String) context
        .getConfigParameterValue(patternKey);
    if (strPattern == null)
      strPattern = patternDefault;
    Pattern pat = null;
    try {
      pat = Strings.isNullOrEmpty(strPattern) ? null : Pattern
          .compile(strPattern);
    } catch (PatternSyntaxException pse) {
      logger.warn("ignoring bad pattern, reverting to default: "
          + strPattern, pse);
      pat = Pattern.compile(patternDefault);
    }
    return pat;
  }


  /**
   * Entry point for processing.
   */
  public void process(JCas jcas) throws AnalysisEngineProcessException {


    logger.info("Starting processing.");


    sentenceCount = 0;


    String text = jcas.getDocumentText();


    JFSIndexRepository indexes = jcas.getJFSIndexRepository();
    Iterator<?> sectionItr = indexes.getAnnotationIndex(Segment.type)
        .iterator();
    while (sectionItr.hasNext()) {
      Segment sa = (Segment) sectionItr.next();
      String sectionID = sa.getId();
      if (!skipSegmentsSet.contains(sectionID)) {
        sentenceCount = annotateParagraph(jcas, text, sa.getBegin(),
            sa.getEnd(), sentenceCount);
      }
    }
  }


  /**
   * split paragraphs. Arc v1.0 had a paragraph splitter, and sentences never
   * crossed paragraph boundaries. paragraph splitter was lost in upgrade to
   * ctakes 1.3.2. Now split paragraphs before running through maximum entropy
   * model - this resolves situations where the model would split after a
   * period, e.g.:
   * 
   * <pre>
   * Clinical History:
   * Mr. So and so
   * </pre>
   * 
   * Without the paragraph splitter, the model splits after Mr. With the
   * paragraph splitter, the model doesn't split after Mr.
   * 
   * @param jcas
   * @param text
   * @param b
   * @param e
   * @param sentenceCount
   * @return
   * @throws AnalysisEngineProcessException 
   * @throws AnnotatorProcessException
   */
  protected int annotateParagraph(JCas jcas, String text, int b, int e,
      int sentenceCount) throws AnalysisEngineProcessException {
    if (this.paragraphPattern == null) {
      return this.annotateRange(jcas, text, b, e, sentenceCount);
    } else {
      int lastEnd = b;
      Matcher m = paragraphPattern.matcher(text);
      while (m.find()) {
        if (m.end() > b && m.end() < e) {
          sentenceCount = annotateRange(jcas, text, lastEnd, m.end(),
              sentenceCount);
          lastEnd = m.end();
        } else if (m.end() >= e) {
          break;
        }
      }
      sentenceCount = annotateRange(jcas, text, lastEnd, e, sentenceCount);
      return sentenceCount;
    }
  }


  /**
   * Detect sentences within a section of the text and add annotations to the
   * CAS. Uses OpenNLP sentence detector, and then additionally forces
   * sentences to end at end-of-line characters (splitting into multiple
   * sentences). Also trims sentences. And if the sentence detector does
   * happen to form a sentence that is just white space, it will be ignored.
   * 
   * @param jcas
   *            view of the CAS containing the text to run sentence detector
   *            against
   * @param text
   *            the document text
   * @param section
   *            the section this sentence is in
   * @param sentenceCount
   *            the number of sentences added already to the CAS (if
   *            processing one section at a time)
   * @return count The sum of <code>sentenceCount</code> and the number of
   *         Sentence annotations added to the CAS for this section
   * @throws AnnotatorProcessException
   */
  protected int annotateRange(JCas jcas, String text, int b, int e,
      int sentenceCount) throws AnalysisEngineProcessException {


    // vng change begin
    // int b = section.getBegin();
    // int e = section.getEnd();
    // vng chang end


    // Use OpenNLP tools to split text into sentences
    // The sentence detector returns the offsets of the sentence-endings it
    // detects
    // within the string
    int[] sentenceBreaks = sentenceDetector.sentPosDetect(text.substring(b,
        e)); // OpenNLP tools 1.5 returns Spans rather than offsets that
            // 1.4 did
    int numSentences = sentenceBreaks.length;
    // There might be text after the last sentence-ending found by detector,
    // so +1
    SentenceSpan[] potentialSentSpans = new SentenceSpan[numSentences + 1];


    int sentStart = b;
    int sentEnd = b;
    // Start by filling in sentence spans from what OpenNLP tools detected
    // Will trim leading or trailing whitespace when check for end-of-line
    // characters
    for (int i = 0; i < numSentences; i++) {
      sentEnd = sentenceBreaks[i] + b; // OpenNLP tools 1.5 returns Spans
                        // rather than offsets that 1.4
                        // did
      String coveredText = text.substring(sentStart, sentEnd);
      potentialSentSpans[i] = new SentenceSpan(sentStart, sentEnd,
          coveredText);
      sentStart = sentEnd;
    }


    // If detector didn't find any sentence-endings,
    // or there was text after the last sentence-ending found,
    // create a sentence from what's left, as long as it's not all
    // whitespace.
    // Will trim leading or trailing whitespace when check for end-of-line
    // characters
    if (sentEnd < e) {
      String coveredText = text.substring(sentEnd, e);
      if (coveredText.trim() != "") {
        potentialSentSpans[numSentences] = new SentenceSpan(sentEnd, e,
            coveredText);
        numSentences++;
      }
    }


    // Copy potentialSentSpans into sentenceSpans,
    // ignoring any that are entirely whitespace,
    // trimming the rest,
    // and splitting any of those that contain an end-of-line character.
    // Then trim any leading or trailing whitespace of ones that were split.
    ArrayList<SentenceSpan> sentenceSpans1 = new ArrayList<SentenceSpan>(0);
    for (int i = 0; i < potentialSentSpans.length; i++) {
      if (potentialSentSpans[i] != null) {
        sentenceSpans1.addAll(potentialSentSpans[i]
            .splitAtLineBreaksAndTrim(NEWLINE)); // TODO Determine
                                // line break
                                // type
      }
    }
    // vng change begin
    // split at ".  "
    ArrayList<SentenceSpan> sentenceSpans = new ArrayList<SentenceSpan>(
        sentenceSpans1.size());
    for (SentenceSpan span : sentenceSpans1) {
      if (span != null) {
        sentenceSpans.addAll(span.splitAtPeriodAndTrim(acronymPattern,
            periodPattern, splitPattern));
      }
    }
    // vng change end


    // Add sentence annotations to the CAS
    int previousEnd = -1;
    for (int i = 0; i < sentenceSpans.size(); i++) {
      SentenceSpan span = sentenceSpans.get(i);
      if (span.getStart() != span.getEnd()) { // skip empty lines
        Sentence sa = new Sentence(jcas);
        sa.setBegin(span.getStart());
        sa.setEnd(span.getEnd());
        if (previousEnd <= sa.getBegin()) {
          // System.out.println("Adding Sentence Annotation for " +
          // span.toString());
          sa.setSentenceNumber(sentenceCount);
          sa.addToIndexes();
          sentenceCount++;
          previousEnd = span.getEnd();
        } else {
          logger.error("Skipping sentence from " + span.getStart()
              + " to " + span.getEnd());
          logger.error("Overlap with previous sentence that ended at "
              + previousEnd);
        }
      }
    }
    return sentenceCount;
  }


  /**
   * Train a new sentence detector from the training data in the first file
   * and write the model to the second file.<br>
   * The training data file is expected to have one sentence per line.
   * 
   * @param args
   *            training_data_filename name_of_model_to_create iters? cutoff?
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    final Logger logger = Logger.getLogger(SentenceDetector.class.getName()
        + ".main()");


    // Handle arguments
    if (args.length < 2 || args.length > 4) {
      usage(logger);
      System.exit(-1);
    }


    File inFile = getReadableFile(args[0]);


    File outFile = getFileInExistingDir(args[1]);
    // File outFile = new File(args[1]);


    int iters = 100;
    if (args.length > 2) {
      iters = parseInt(args[2], logger);
    }


    int cut = 5;
    if (args.length > 3) {
      cut = parseInt(args[3], logger);
    }


    // Now, do the actual training
    EndOfSentenceScannerImpl scanner = new EndOfSentenceScannerImpl();
    int numEosc = scanner.getEndOfSentenceCharacters().length;


    logger.info("Training new model from " + inFile.getAbsolutePath());
    logger.info("Using " + numEosc + " end of sentence characters.");


    logger.error("----------------------------------------------------------------------------------");
    logger.error("Need to update yet for OpenNLP changes "); // TODO
    logger.error("Commented out code that no longer compiles due to OpenNLP API incompatible changes"); // TODO
    logger.error("----------------------------------------------------------------------------------");
    // GISModel mod = SentenceDetectorME.train(inFile, iters, cut, scanner);
    // SuffixSensitiveGISModelWriter ssgmw = new
    // SuffixSensitiveGISModelWriter(
    // mod, outFile);
    // logger.info("Saving the model as: " + outFile.getAbsolutePath());
    // ssgmw.persist();


  }


  public static void usage(Logger log) {
    log.info("Usage: java "
        + SentenceDetector.class.getName()
        + " training_data_filename name_of_model_to_create <iters> <cut>");
  }


  public static int parseInt(String s, Logger log) {
    try {
      return Integer.parseInt(s);
    } catch (NumberFormatException nfe) {
      log.error("Unable to parse '" + s + "' as an integer.");
      throw (nfe);
    }
  }


  public static File getReadableFile(String fn) throws IOException {
    File f = new File(fn);
    if (!f.canRead()) {
      throw new IOException("Unable to read from file "
          + f.getAbsolutePath());
    }
    return f;
  }


  public static File getFileInExistingDir(String fn) throws IOException {
    File f = new File(fn);
    if (!f.getParentFile().isDirectory()) {
      throw new IOException("Directory not found: "
          + f.getParentFile().getAbsolutePath());
    }
    return f;
  }


}
Source Code of org.apache.ctakes.ytex.uima.annotators.SentenceDetector

Related Classes of org.apache.ctakes.ytex.uima.annotators.SentenceDetector