Package org.apache.ctakes.assertion.pipelines

Source Code of org.apache.ctakes.assertion.pipelines.GoldEntityAndAttributeReaderPipelineForSeedCorpus

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.assertion.pipelines;


import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.HashMap;

import org.apache.ctakes.assertion.cr.I2B2Challenge2010CollectionReader;
import org.apache.ctakes.assertion.cr.MiPACQKnowtatorXMLReader;
import org.apache.ctakes.assertion.cr.NegExCorpusReader;
import org.apache.ctakes.assertion.pipelines.SharpCorpusSplit.Subcorpus;
import org.apache.ctakes.core.ae.SHARPKnowtatorXMLReader;
import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
import org.apache.log4j.Logger;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.uimafit.component.xwriter.XWriter;
import org.uimafit.factory.AggregateBuilder;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.CollectionReaderFactory;
import org.uimafit.factory.TypeSystemDescriptionFactory;
import org.uimafit.pipeline.SimplePipeline;
import com.google.common.base.Function;

/**
*
* A class for testing the reader for the gold standard relation data.
* Currently this class runs the reader and saves the resulting annotations as xmi files.
*
* @author dmitriy dligach
* @author matt coarr
*
*/
public class GoldEntityAndAttributeReaderPipelineForSeedCorpus {
 
  static final Logger logger = Logger.getLogger(GoldEntityAndAttributeReaderPipelineForSeedCorpus.class.getName());

  public static void main(String[] args) throws UIMAException, IOException {
   
    logger.warn("This should be run with one command-line argument that is the parent UMLS_CEM directory.");
    logger.warn("Also, make sure each ss1_batch* directory has both a Knowtator/text directory and a Knowtator_XML directory (not the underscore in the xml directory, not a space)");
   
    if (args.length != 1)
    {
      System.out.println("Requires one parameter that is the UMLS_CEM main directory (e.g. the \"Seed_Corpus/Mayo/UMLS_CEM\" or \"Seattle Group Health/UMLS_CEM\"). The path should be fully specified.");
    }
   
    String parentDirectoryString = args[0];
    //String parentDirectoryString = "/work/medfacts/sharp/data/2012-10-16_full_data_set_updated/Seed_Corpus/Seattle Group Health/UMLS_CEM";

    File parentDirectory = new File(parentDirectoryString);
    readSharpSeedUmlsCem(parentDirectory);
   
  }

  public static void readSharpSeedUmlsCem(File parentDirectory) throws ResourceInitializationException, UIMAException, IOException {
    readSharpSeedUmlsCem(parentDirectory, null, null, null);
  }
 
  public static void readSharpSeedUmlsCem(File parentDirectory, File trainDirectory, File testDirectory, File devDirectory)
      throws ResourceInitializationException, UIMAException, IOException {
//    logger.info("parent directory: " + parentDirectoryString);
//    File parentDirectory = new File(parentDirectoryString);
    if (!parentDirectory.exists())
    {
      logger.fatal(String.format("parent directory %s does not exist! exiting!", parentDirectory.getAbsolutePath()));
      return;
    }
   
    File batchDirectories[] = parentDirectory.listFiles(new FileFilter() {
     
      @Override
      public boolean accept(File pathname) {
        return pathname.isDirectory();
      }
    });
   
    for (File currentBatchDirectory : batchDirectories)
    {
     
      logger.info("current batch directory: " + currentBatchDirectory.getName());
     
      if (!currentBatchDirectory.exists())
      {
        logger.fatal(String.format("current batch directory does not exist! exiting! [\"%s\"]", currentBatchDirectory.toString()));
        continue;
      }
     
      File knowtatorDirectory = new File(currentBatchDirectory, "Knowtator");
      File textDirectory = new File(knowtatorDirectory, "text");
      // train set uses this naming convention
      File xmlDirectory = new File(currentBatchDirectory, "Knowtator_XML");
      File xmiDirectory = new File(currentBatchDirectory, "Knowtator_XMI");
      // dev and test sets use this naming convention
      if (!xmlDirectory.exists()) {
        xmlDirectory = new File(currentBatchDirectory, "Knowtator XML");
        xmiDirectory = new File(currentBatchDirectory, "Knowtator XMI");
      }
     
      if (!knowtatorDirectory.isDirectory() ||
          !textDirectory.isDirectory() ||
          !xmlDirectory.isDirectory())
      {
        logger.error("one of the directories does not exist! skipping...");
        continue;
      }
     
      if (!xmiDirectory.isDirectory())
      {
        xmiDirectory.mkdir();
      }
       
     
      TypeSystemDescription typeSystemDescription =
          // use the uimafit method of finding available type system
          // descriptor via META-INF/org.uimafit/types.txt
          // (found in ctakes-type-system/src/main/resources)
        TypeSystemDescriptionFactory.createTypeSystemDescription();
     
      AggregateBuilder aggregate = new AggregateBuilder();
     
      CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
          FilesInDirectoryCollectionReader.class,
          typeSystemDescription,
          "InputDirectory",
          textDirectory.toString()
          );
     
      // read the UMLS_CEM data from Knowtator
      AnalysisEngineDescription goldAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
          SHARPKnowtatorXMLReader.class,
          typeSystemDescription,
          "TextDirectory", // 3/13/13 halgrim changed from "TextURI" trying to work with new SHARPKnowtatorXMLReader.java
          //"/work/medfacts/sharp/data/2012-10-16_full_data_set_updated/Seed_Corpus/sandbox/batch02_mayo/knowtator/"
          textDirectory.toString() + "/"
      );
      aggregate.add(goldAnnotator);

      // write just the XMI version of what's in Knowtator UMLS_CEM
      AnalysisEngineDescription xWriter = AnalysisEngineFactory.createPrimitiveDescription(
          XWriter.class,
          typeSystemDescription,
          XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
          xmiDirectory.toString(),
          XWriter.PARAM_FILE_NAMER_CLASS_NAME,
          CtakesFileNamer.class.getName()
      );
      aggregate.add(xWriter);

      // fill in other values that are necessary for preprocessing
      AnalysisEngineDescription preprocessAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(
          "desc/analysis_engine/AttributeDiscoveryPreprocessor"
          );
      aggregate.add(preprocessAnnotator);
     
      if (trainDirectory!=null && testDirectory!=null && devDirectory!=null) {
        File subcorpusDirectory;
        switch (SharpCorpusSplit.splitSeed(currentBatchDirectory)) {
        case TRAIN:
          subcorpusDirectory = trainDirectory;
          break;
        case TEST:
          subcorpusDirectory = testDirectory;
          break;
        case DEV:
          subcorpusDirectory = devDirectory;
          break;
        case CROSSVAL:
          subcorpusDirectory = trainDirectory;
          break;
        default:
          subcorpusDirectory = trainDirectory;
          break;
        }
        AnalysisEngineDescription xWriter2 = AnalysisEngineFactory.createPrimitiveDescription(
            XWriter.class,
            typeSystemDescription,
            XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
            subcorpusDirectory,
            XWriter.PARAM_FILE_NAMER_CLASS_NAME,
            CtakesFileNamer.class.getName()
        );
        aggregate.add(xWriter2);
//        SimplePipeline.runPipeline(collectionReader, goldAnnotator, xWriter, xWriter2);
      }

      SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
    }

    logger.info("Finished!");
  }
 
  public static void readSharpStratifiedUmls(File releaseDirectory, File trainDirectory, File testDirectory, File devDirectory) throws UIMAException, IOException{
    File mayoStrat = new File(releaseDirectory, "SHARP/MayoStrat/by-batch/umls");
    // sghStrat not annotated yet...
    File sghStrat = new File(releaseDirectory, "SHARP/SGHStrat1/by-batch/umls");
   
    readSharpUmls(new File[] {mayoStrat, sghStrat}, trainDirectory, testDirectory, devDirectory,
        new Function<File,Subcorpus>(){
      public Subcorpus apply(File f){
        return SharpCorpusSplit.splitStratified(f);
    }
    });
  }
 
  public static void readSharpSeedUmls(File releaseDirectory, File trainDirectory, File testDirectory, File devDirectory) throws UIMAException, IOException{
    File seed1 = new File(releaseDirectory, "SHARP/SeedSet1/by-batch/umls");
    readSharpUmls(new File[] {seed1}, trainDirectory, testDirectory, devDirectory,
         new Function<File,Subcorpus>(){
        public Subcorpus apply(File f){
          return SharpCorpusSplit.splitSeed(f);
      }
      });

  }
 
  public static void readSharpUmls(File[] sections, File trainDirectory, File testDirectory, File devDirectory, Function<File,Subcorpus> splitFunction) throws UIMAException, IOException{
    for(File section : sections){
      File[] batches = section.listFiles(new FileFilter(){

        @Override
        public boolean accept(File pathname) {
          return pathname.isDirectory();
        }});
      for(File batchDir : batches){
        TypeSystemDescription typeSystemDescription =
            // use the uimafit method of finding available type system
            // descriptor via META-INF/org.uimafit/types.txt
            // (found in ctakes-type-system/src/main/resources)
          TypeSystemDescriptionFactory.createTypeSystemDescription();
       
        File textDirectory = new File(batchDir, "text");
        AggregateBuilder aggregate = new AggregateBuilder();
       
        CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
            FilesInDirectoryCollectionReader.class,
            typeSystemDescription,
            "InputDirectory",
            textDirectory.toString()
            );
       
        // read the UMLS_CEM data from Knowtator
        AnalysisEngineDescription goldAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
            SHARPKnowtatorXMLReader.class,
            typeSystemDescription,
            "TextDirectory", // 3/13/13 halgrim changed from "TextURI" trying to work with new SHARPKnowtatorXMLReader.java
            //"/work/medfacts/sharp/data/2012-10-16_full_data_set_updated/Seed_Corpus/sandbox/batch02_mayo/knowtator/"
            textDirectory.toString() + "/"
        );
        aggregate.add(goldAnnotator);

        // fill in other values that are necessary for preprocessing
        AnalysisEngineDescription preprocessAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(
            "desc/analysis_engine/AttributeDiscoveryPreprocessor"
            );
        aggregate.add(preprocessAnnotator);

        File subcorpusDir = null;
//        Subcorpus subcorpus = SharpCorpusSplit.splitStratified(Integer.parseInt(batchDir.getName()));
        Subcorpus subcorpus = splitFunction.apply(batchDir);
        switch(subcorpus){
        case TRAIN:
          subcorpusDir = trainDirectory;
          break;
        case DEV:
          subcorpusDir = devDirectory;
          break;
        case TEST:
          subcorpusDir = testDirectory;
          break;
        default:
          subcorpusDir = trainDirectory;
        }
       
         AnalysisEngineDescription xWriter = AnalysisEngineFactory.createPrimitiveDescription(
              XWriter.class,
              typeSystemDescription,
              XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
              subcorpusDir,
              XWriter.PARAM_FILE_NAMER_CLASS_NAME,
              CtakesFileNamer.class.getName()
          );
         aggregate.add(xWriter);
         SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());

      }
    }
  }

  public static void readI2B2Challenge2010(File parentDirectory, File preprocessedDirectory)
  throws ResourceInitializationException, UIMAException, IOException {

    TypeSystemDescription typeSystemDescription =
      // use the uimafit method of finding available type system
      // descriptor via META-INF/org.uimafit/types.txt
      // (found in ctakes-type-system/src/main/resources)
      TypeSystemDescriptionFactory.createTypeSystemDescription();

    AggregateBuilder aggregate = new AggregateBuilder();

    CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
        I2B2Challenge2010CollectionReader.class,
        typeSystemDescription,
        "inputDir",
        parentDirectory
    );

    // fill in other values that are necessary for preprocessing
    AnalysisEngineDescription preprocessAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(
        "desc/analysis_engine/AttributeDiscoveryPreprocessor"
    );
    aggregate.add(preprocessAnnotator);

    if (preprocessedDirectory!=null) {
      AnalysisEngineDescription xWriter2 = AnalysisEngineFactory.createPrimitiveDescription(
          XWriter.class,
          typeSystemDescription,
          XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
          preprocessedDirectory,
          XWriter.PARAM_FILE_NAMER_CLASS_NAME,
          CtakesFileNamer.class.getName()
      );
      aggregate.add(xWriter2);
      //    SimplePipeline.runPipeline(collectionReader, goldAnnotator, xWriter, xWriter2);
    }

    SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
    logger.info("Finished!");
  }

  public static void readNegexTestSet(File inputFile, File preprocessedDirectory)
  throws ResourceInitializationException, UIMAException, IOException {

    TypeSystemDescription typeSystemDescription =
      TypeSystemDescriptionFactory.createTypeSystemDescription();

    AggregateBuilder aggregate = new AggregateBuilder();

    // input dir is hard-coded in AssertionConst
    CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
        NegExCorpusReader.class,
        typeSystemDescription
    );

    // fill in other values that are necessary for preprocessing
    AnalysisEngineDescription preprocessAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(
        "desc/analysis_engine/AttributeDiscoveryPreprocessor"
    );
    aggregate.add(preprocessAnnotator);

    if (preprocessedDirectory!=null) {
      AnalysisEngineDescription xWriter2 = AnalysisEngineFactory.createPrimitiveDescription(
          XWriter.class,
          typeSystemDescription,
          XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
          preprocessedDirectory,
          XWriter.PARAM_FILE_NAMER_CLASS_NAME,
          CtakesFileNamer.class.getName()
      );
      aggregate.add(xWriter2);
      //    SimplePipeline.runPipeline(collectionReader, goldAnnotator, xWriter, xWriter2);
    }

    SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
    logger.info("Finished!");
  }

  public static void readMiPACQ(File inputDirectory, File preprocessedDirectory, File testDirectory, File devDirectory)
  throws ResourceInitializationException, UIMAException, IOException {

    TypeSystemDescription typeSystemDescription =
      TypeSystemDescriptionFactory.createTypeSystemDescription();

    HashMap<File,File> splitMipacq = new HashMap<File,File>();
    splitMipacq.put(new File(inputDirectory+"/text/train"), preprocessedDirectory);
    splitMipacq.put(new File(inputDirectory+"/text/test"),  testDirectory);
    splitMipacq.put(new File(inputDirectory+"/text/dev"),   devDirectory);
    for (File inDir : splitMipacq.keySet() ) {
      AggregateBuilder aggregate = new AggregateBuilder();

      CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
          FilesInDirectoryCollectionReader.class,
          typeSystemDescription,
          "InputDirectory",
          inDir
          );

      // read the UMLS_CEM data from Knowtator
      AnalysisEngineDescription goldAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
          MiPACQKnowtatorXMLReader.class,
          typeSystemDescription,
          MiPACQKnowtatorXMLReader.PARAM_TEXT_DIRECTORY,
          inDir
          );

      aggregate.add(goldAnnotator);
      // fill in other values that are necessary for preprocessing
      AnalysisEngineDescription preprocessAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(
          "desc/analysis_engine/AttributeDiscoveryPreprocessor"
          );
      aggregate.add(preprocessAnnotator);

      if (preprocessedDirectory!=null) {
        AnalysisEngineDescription xWriter2 = AnalysisEngineFactory.createPrimitiveDescription(
            XWriter.class,
            typeSystemDescription,
            XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
            splitMipacq.get(inDir),
            XWriter.PARAM_FILE_NAMER_CLASS_NAME,
            CtakesFileNamer.class.getName()
            );
        aggregate.add(xWriter2);
        //    SimplePipeline.runPipeline(collectionReader, goldAnnotator, xWriter, xWriter2);
      }

      SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
    }
     
    logger.info("Finished!");
  }

}
TOP

Related Classes of org.apache.ctakes.assertion.pipelines.GoldEntityAndAttributeReaderPipelineForSeedCorpus

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.