/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.assertion.medfacts;
import org.mitre.medfacts.zoner.CharacterOffsetToLineTokenConverter;
import org.mitre.medfacts.zoner.CharacterOffsetToLineTokenConverterDefaultImpl;
import java.io.File;
import java.io.FileInputStream;
import org.apache.log4j.Logger;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.xml.sax.SAXException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.cas.TypeSystem;
import java.util.Collections;
import org.apache.uima.util.XMLParser;
import org.apache.uima.UIMAFramework;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.Sofa;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.InvalidXMLException;
import org.apache.uima.resource.ResourceInitializationException;
import org.mitre.medfacts.i2b2.annotation.Annotation;
import org.mitre.medfacts.i2b2.annotation.ConceptType;
import org.apache.uima.cas.text.AnnotationIndex;
//import org.apache.uima.jcas.tcas.Annotation;
import org.mitre.medfacts.zoner.LineAndTokenPosition;
import java.io.PrintWriter;
import java.io.IOException;
import org.mitre.medfacts.i2b2.api.SingleDocumentProcessor;
import org.mitre.medfacts.i2b2.util.StringHandling;
import org.mitre.medfacts.zoner.LineTokenToCharacterOffsetConverter;
import org.mitre.medfacts.i2b2.processors.AssertionFileProcessor;
import java.util.List;
import org.apache.ctakes.assertion.medfacts.types.Assertion;
import org.apache.ctakes.assertion.medfacts.types.Concept;
import org.apache.ctakes.typesystem.type.textsem.EntityMention;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.ctakes.typesystem.type.syntax.WordToken;
import org.apache.uima.jcas.JCas;
import java.io.FileOutputStream;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.util.XMLSerializer;
/*
* Comments on using this class:
* This provides a utility to convert i2b2 format sentence and tokens to cTAKES Sentence and WordToken
* annotations. It takes as input the input text file, as well as the assertion annotations (the .ast file).
* It also takes the pipeline descriptor (the top-level descriptor used for the entire pipeline) and an output
* file path. The potentially tricky part of this process involves setting up the descriptor so that
* it refers to the correct sub-descriptors, etc. This probably won't be a problem with a full install
* of the pipeline, but could be an issue in a development environment - some paths may need to be manually
 * (and temporarily) adjusted.
*/
public class Converti2b2AnnotationsToCTAKES {

  private static Logger logger = Logger.getLogger(Converti2b2AnnotationsToCTAKES.class);

  /**
   * Creates a fresh CAS whose type system is read from the given top-level
   * analysis engine descriptor file.
   *
   * @param descriptor path to the pipeline's analysis engine descriptor XML
   * @return a new, empty CAS configured with the descriptor's type system
   * @throws InvalidXMLException if the descriptor is not valid UIMA XML
   * @throws IOException if the descriptor file cannot be read
   * @throws ResourceInitializationException if the CAS cannot be created
   * @throws CASException on CAS-level failures
   */
  private static CAS getTypeSystemFromDescriptor(String descriptor)
      throws InvalidXMLException, IOException, ResourceInitializationException, CASException {
    XMLParser xmlParser = UIMAFramework.getXMLParser();
    AnalysisEngineDescription tsDesc =
        xmlParser.parseAnalysisEngineDescription(new XMLInputSource(descriptor));
    return CasCreationUtils.createCas(tsDesc);
  }

  /**
   * Serialize a CAS to a file in XMI format.
   *
   * @param aCas CAS to serialize
   * @param name output file
   * @throws IOException if the output file cannot be opened or written
   * @throws SAXException if XMI serialization fails
   */
  private static void writeXmi(CAS aCas, File name) throws IOException, SAXException {
    FileOutputStream out = null;
    try {
      out = new FileOutputStream(name);
      XmiCasSerializer ser = new XmiCasSerializer(aCas.getTypeSystem());
      XMLSerializer xmlSer = new XMLSerializer(out, false);
      ser.serialize(aCas, xmlSer.getContentHandler());
    } finally {
      // Close even on serialization failure so the file handle is not leaked.
      if (out != null) {
        out.close();
      }
    }
  }

  /**
   * Converts i2b2-format sentences, tokens, and assertion annotations into
   * cTAKES {@link Sentence}, {@link WordToken}, and entity/event mention
   * annotations, writing one XMI file per input text file.
   *
   * <p>Expected arguments:
   * <ul>
   *   <li>args[0] - directory containing the input text files</li>
   *   <li>args[1] - directory containing the matching i2b2 .ast assertion files</li>
   *   <li>args[2] - path to the top-level pipeline descriptor</li>
   *   <li>args[3] - output directory for the generated .xmi files</li>
   * </ul>
   */
  public static void main(String[] args)
      throws IOException, InvalidXMLException, CASException, SAXException,
      AnalysisEngineProcessException {
    // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 4) {
      throw new IllegalArgumentException(
          "usage: Converti2b2AnnotationsToCTAKES <inputTextDir> <assertionDir>"
              + " <pipelineDescriptor> <outputDir>");
    }
    File assertionDir = new File(args[1]);
    File dir = new File(args[0]);
    File odir = new File(args[3]);
    String desc = args[2];

    // listFiles() returns null when 'dir' is not a directory or cannot be read;
    // the original code would NPE in the for-each below.
    File[] inputFiles = dir.listFiles();
    if (inputFiles == null) {
      throw new IOException("input path is not a readable directory: " + dir);
    }

    for (File file : inputFiles) {
      // A fresh CAS per input file so annotations never leak between documents.
      CAS cas = null;
      try {
        cas = getTypeSystemFromDescriptor(desc);
      } catch (Exception e) {
        throw new RuntimeException(e);
      }

      String contents = StringHandling.readEntireContents(file);
      String fname = file.getName();
      // Output/assertion names are derived by swapping a 3-character extension
      // (assumes inputs like "record.txt" -> "record.xmi" / "record.ast").
      String outFilePath = odir + "/" + file.getName().substring(0, fname.length() - 3) + "xmi";
      File outFile = new java.io.File(outFilePath);
      String assertionFilePath =
          assertionDir + "/" + file.getName().substring(0, fname.length() - 3) + "ast";
      System.out.println("Assertion file path: " + assertionFilePath);
      File assertionFile = new java.io.File(assertionFilePath);

      // Converts i2b2 line/token coordinates into character offsets for this document.
      LineTokenToCharacterOffsetConverter converter =
          new LineTokenToCharacterOffsetConverter(contents);
      AssertionFileProcessor assertionProcessor = new AssertionFileProcessor();
      List<Annotation> assertions = assertionProcessor.processAnnotationFile(assertionFile);

      JCas jcas = cas.getJCas();
      SingleDocumentProcessor p = new SingleDocumentProcessor();
      p.setContents(contents);
      p.preprocess();
      // tokenArrays[i] holds the token strings of line i (i2b2 lines are 1-based).
      String[][] tokenArrays = p.getTokenArrays();
      // NOTE(review): empty string is passed as the sofa MIME type — confirm whether
      // "text/plain" is intended here.
      jcas.setSofaDataString(contents, "");

      int sentNum = 0;
      int tokNum = 0;
      for (int i = 0; i < tokenArrays.length; i++) {
        // One Sentence per i2b2 line, spanning from its first to its last token.
        Sentence sent = new Sentence(jcas);
        LineAndTokenPosition sentStart = new LineAndTokenPosition();
        sentStart.setLine(i + 1);
        sentStart.setTokenOffset(0);
        LineAndTokenPosition sentEnd = new LineAndTokenPosition();
        sentEnd.setLine(i + 1);
        sentEnd.setTokenOffset(tokenArrays[i].length - 1);
        LineTokenToCharacterOffsetConverter.BeginAndEndCharacterOffsetPair sStart =
            converter.convert(sentStart);
        LineTokenToCharacterOffsetConverter.BeginAndEndCharacterOffsetPair sEnd =
            converter.convert(sentEnd);
        if ((sStart == null) || (sEnd == null)) {
          // Converter could not map the position (e.g. empty line): emit a
          // zero-width sentence rather than dropping it, keeping numbering stable.
          sent.setBegin(0);
          sent.setEnd(0);
          sent.setSentenceNumber(sentNum);
        } else {
          sent.setBegin(sStart.getBegin()); // begin of first token
          sent.setEnd(sEnd.getEnd() + 1); // end of last token (converter end is inclusive)
          sent.setSentenceNumber(sentNum);
        }
        sentNum++;
        sent.addToIndexes();

        // One WordToken per i2b2 token on this line.
        for (int j = 0; j < tokenArrays[i].length; j++) {
          WordToken tok = new WordToken(jcas);
          LineAndTokenPosition word = new LineAndTokenPosition();
          word.setLine(i + 1);
          word.setTokenOffset(j);
          LineTokenToCharacterOffsetConverter.BeginAndEndCharacterOffsetPair tPos =
              converter.convert(word);
          if (tPos == null) {
            // Unmappable token: placeholder 1-character span at document start.
            tok.setBegin(0);
            tok.setEnd(1);
          } else {
            tok.setBegin(tPos.getBegin());
            tok.setEnd(tPos.getEnd() + 1);
          }
          tok.setTokenNumber(tokNum);
          tokNum++;
          tok.addToIndexes();
        }
      }

      logger.info("before assertions");
      for (Annotation a : assertions) {
        logger.info("  begin assertion");
        logger.info("  assertion: " + a.toString());
        org.mitre.medfacts.i2b2.annotation.AssertionAnnotation i2b2Assertion =
            (org.mitre.medfacts.i2b2.annotation.AssertionAnnotation) a;
        ConceptType conceptType = i2b2Assertion.getConceptType();
        // TREATMENT concepts map to events; everything else to entities.
        IdentifiedAnnotation entityOrEventMention = null;
        if (conceptType.equals(ConceptType.TREATMENT)) {
          entityOrEventMention = new EventMention(jcas);
        } else {
          entityOrEventMention = new EntityMention(jcas);
        }
        LineAndTokenPosition assertionStart = new LineAndTokenPosition();
        LineAndTokenPosition assertionEnd = new LineAndTokenPosition();
        assertionStart.setLine(a.getBegin().getLine());
        assertionStart.setTokenOffset(a.getBegin().getTokenOffset());
        assertionEnd.setLine(a.getEnd().getLine());
        assertionEnd.setTokenOffset(a.getEnd().getTokenOffset());
        entityOrEventMention.setBegin(converter.convert(assertionStart).getBegin());
        // NOTE(review): unlike the sentence/token spans above, no "+ 1" is added to
        // the end offset here — confirm whether mention ends are intentionally
        // one character short of the token end.
        entityOrEventMention.setEnd(converter.convert(assertionEnd).getEnd());
        entityOrEventMention.setConfidence(1.0f);
        FSArray ontologyConceptArray = ConceptLookup.reverseLookup(conceptType, jcas);
        entityOrEventMention.setOntologyConceptArr(ontologyConceptArray);
        AssertionAnalysisEngine.mapI2B2AssertionValueToCtakes(
            i2b2Assertion.getAssertionValue().toString().toLowerCase(), entityOrEventMention);
        entityOrEventMention.addToIndexes();
        logger.info("  end assertion");
      }
      logger.info("after assertions");
      writeXmi(cas, outFile);
    }
  }

  /**
   * Unimplemented placeholder for mapping i2b2 assertion attributes onto a
   * cTAKES mention; currently a no-op and never called.
   */
  private static void adjustAssertionAttributesByI2B2Convertion(
      IdentifiedAnnotation entityOrEventMention,
      org.mitre.medfacts.i2b2.annotation.AssertionAnnotation i2b2Assertion) {
  }
}