Package org.apache.uima.examples.casMultiplier

Source Code of org.apache.uima.examples.casMultiplier.SimpleTextMerger

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.examples.casMultiplier;

import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.CasCopier;

/**
* An example CasMultiplier, which merges text documents into larger ones. It attempts to merge all
* of the segments that came from one original artifact. This is done by checking the "lastSegment"
* feature of the SourceDocumentInformation FeatureStructure, which is expected to be populated by
* the CollectionReader or CasMultiplier that produced the input CASes.
* <p>
* Limitations: if the lastSegment feature is never set to true by the component producing the input
* CASes, the merger will never produce any output. Also, this implementation relies on the CASes
* arriving in order, which could be a problem in a mulithreaded framework implementation. The order
* requirement could be relieved by recording a segment number in the SourceDocumentInformation, but
* that would also make this example more complicated.
*/
public class SimpleTextMerger extends JCasMultiplier_ImplBase {
 
  public static final String MESSAGE_DIGEST = "org.apache.uima.examples.casMultiplier.ExampleCasMultiplierMessages";
 
  public static final String MISSING_SOURCE_DOCUMENT_INFO = "missing_source_document_info";
 
  public static final String NO_NEXT_CAS = "no_next_cas";
 
  private StringBuffer mDocBuf = new StringBuffer();

  private JCas mMergedCas;

  private boolean mReadyToOutput = false;

  private String[] mAnnotationTypesToCopy;

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext)
   */
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    mAnnotationTypesToCopy = (String[]) aContext.getConfigParameterValue("AnnotationTypesToCopy");
  }

  /*
   * (non-Javadoc)
   *
   * @see JCasMultiplier_ImplBase#process(JCas)
   */
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // procure a new CAS if we don't have one already
    if (mMergedCas == null) {
      mMergedCas = getEmptyJCas();
    }

    // append document text
    String docText = aJCas.getDocumentText();
    int prevDocLen = mDocBuf.length();
    mDocBuf.append(docText);

    // copy specified annotation types
    CasCopier copier = new CasCopier(aJCas.getCas(), mMergedCas.getCas());
    Set copiedIndexedFs = new HashSet(); // needed in case one annotation is in two indexes (could
    // happen if specified annotation types overlap)
    for (int i = 0; i < mAnnotationTypesToCopy.length; i++) {
      Type type = mMergedCas.getTypeSystem().getType(mAnnotationTypesToCopy[i]);
      FSIndex index = aJCas.getCas().getAnnotationIndex(type);
      Iterator iter = index.iterator();
      while (iter.hasNext()) {
        FeatureStructure fs = (FeatureStructure) iter.next();
        if (!copiedIndexedFs.contains(fs)) {
          Annotation copyOfFs = (Annotation) copier.copyFs(fs);
          // update begin and end
          copyOfFs.setBegin(copyOfFs.getBegin() + prevDocLen);
          copyOfFs.setEnd(copyOfFs.getEnd() + prevDocLen);
          mMergedCas.addFsToIndexes(copyOfFs);
          copiedIndexedFs.add(fs);
        }
      }
    }

    // get the SourceDocumentInformation FS, which indicates the sourceURI of the document
    // and whether the incoming CAS is the last segment
    FSIterator it = aJCas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
    if (!it.hasNext()) {
      throw new AnalysisEngineProcessException(MESSAGE_DIGEST, MISSING_SOURCE_DOCUMENT_INFO,
              new Object[0]);
    }
    SourceDocumentInformation sourceDocInfo = (SourceDocumentInformation) it.next();
    if (sourceDocInfo.getLastSegment()) {
      // time to produce an output CAS
      // set the document text
      mMergedCas.setDocumentText(mDocBuf.toString());

      // add source document info to destination CAS
      SourceDocumentInformation destSDI = new SourceDocumentInformation(mMergedCas);
      destSDI.setUri(sourceDocInfo.getUri());
      destSDI.setOffsetInSource(0);
      destSDI.setLastSegment(true);
      destSDI.addToIndexes();

      mDocBuf = new StringBuffer();
      mReadyToOutput = true;
    }
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.AnalysisComponent#hasNext()
   */
  public boolean hasNext() throws AnalysisEngineProcessException {
    return mReadyToOutput;
  }

  /*
   * (non-Javadoc)
   *
   * @see org.apache.uima.analysis_component.AnalysisComponent#next()
   */
  public AbstractCas next() throws AnalysisEngineProcessException {
    if (!mReadyToOutput) {
      throw new AnalysisEngineProcessException(MESSAGE_DIGEST, NO_NEXT_CAS, new Object[0]);
    }
    JCas casToReturn = mMergedCas;
    mMergedCas = null;
    mReadyToOutput = false;
    return casToReturn;
  }

}
TOP

Related Classes of org.apache.uima.examples.casMultiplier.SimpleTextMerger

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.