// Package main
//
// Source code of main.MLPipeline

package main;
/***************************************************************************
Clinical Named Entity Recognizer and Normalizer(Clinical NERC), (v0.1).
Copyright (C) 2013  Azad Dehghan

Contact:  a.dehghan@manchester.ac.uk
*****************************************************************************/
import crfplusplus.Crf;
import gate.Annotation;
import gate.Factory;

import io.FileOps;
import io.RecurseFolder;

import java.io.File;
import java.util.ArrayList;

import classification.MappingPipeline;


public class MLPipeline {

  private static ArrayList<String> featureFile = null;
  private static gate.Document gateDoc = null;
 
  public MLPipeline()
  {
    new Pipeline();
  }
   
     /**
      * ml pipeline
      *
      * @param source_dir
      * @param op_dir
      * @throws Exception
      */
    public void ml(String source_dir, String op_dir, Boolean nercFlag) throws Exception
    {
      MappingPipeline mapping = null;
     
      if(nercFlag)
        mapping = new MappingPipeline();
     
      featureFile = new ArrayList<String>();

      ArrayList<File> fileList = RecurseFolder.getFileList(source_dir);
      for(File f: fileList)
      {
        System.out.print("\r.processing: " + f.getName() + "\r");
        String filename = f.getName().substring(0, f.getName().indexOf("."));
       
        gateDoc = Factory.newDocument(FileOps.getFileContent(f.getAbsolutePath()));
        gateDoc = Pipeline.preProcPipeline(gateDoc);
        gateDoc.setName(f.getName().replace(".txt", ""));
       
        //construct a feature vector for document f
        featureFile.clear();
        Logic.constructFeatureFile(gateDoc, featureFile);
            
       
        for(int i = 0; i < 3; i++)
        {
          String model = "test";
          if (i==1)
            {model = "treatment";}
          else if(i==2)
            {model = "problem";}
         
          String tagged_file = Crf.run("models/"+model + "_model_iobw", featureFile);
          String[] aS = tagged_file.split("\n");   
          for(String s : aS)
          {
            String[] row = s.split("\t");
            String label = row[1];
            int tokenId = Integer.parseInt(row[0].substring(row[0].indexOf(":")+1, row[0].length()));
     
            Annotation aToken = gateDoc.getAnnotations().get("Token").get(tokenId);
            gate.FeatureMap gateMap = aToken.getFeatures();
            gateMap.put("crf_prediction", label + "_" + model);
            aToken.setFeatures(gateMap);
          }
            //post-process:
            //-crf_prediction tags per token to final crf_prediction annotation span across 1 or more tokens [Done]
            //-remove FP & adjust boundaries [rule set not added yet 20.4.13]
          gateDoc = Pipeline.postProcPipeline(gateDoc);
         
          //classification pipeline
          if(nercFlag)
            gateDoc = mapping.run(gateDoc);
        }
        //save to op_dir
        FileOps.saveFile(op_dir + "/" + filename + ".xml", gateDoc.toXml());
      }
    Factory.deleteResource(gateDoc);
   
 
    /**
     * ml pipeline
     * @param gateDoc
     * @return
     * @throws Exception
     */
    public gate.Document ml(gate.Document gateDoc, MappingPipeline mapping, Boolean nercFlag) throws Exception
    {     
      featureFile = new ArrayList<String>();
      gateDoc = Pipeline.preProcPipeline(gateDoc);
       
      //construct a fVector for document i.e., parse pipeline output
      featureFile.clear();
      Logic.constructFeatureFile(gateDoc, featureFile);
     

      for(int i = 0; i < 3; i++)
      {
        String model = "test";
        if (i==1)
          {model = "treatment";}
        else if(i==2)
          {model = "problem";}
         
        String tagged_file = Crf.run("models/"+model + "_model_iobw", featureFile);
        String[] aS = tagged_file.split("\n");   
        for(String s : aS)
        {
          String[] row = s.split("\t");
          String label = row[1];
          int tokenId = Integer.parseInt(row[0].substring(row[0].indexOf(":")+1, row[0].length()));
   
          Annotation aToken = gateDoc.getAnnotations().get("Token").get(tokenId);
          gate.FeatureMap gateMap = aToken.getFeatures();
          gateMap.put("crf_prediction", label + "_" + model);
          aToken.setFeatures(gateMap);
        }
       
         // post-process:
         // -crf_prediction tags per token to final crf_prediction annotation span across 1 or more tokens [Done]
         // -remove FP & adjust boundaries [rule set not added yet 20.4.13]
        gateDoc = Pipeline.postProcPipeline(gateDoc);
       
        //classification pipeline
        if(nercFlag)
          gateDoc = mapping.run(gateDoc);
      }   
      return gateDoc;
    }   
   
    /**
    public static String getFeatureFile(ArrayList<String> fFile)
    {
      String fv = "";
      for(int i=0;i<fFile.size()-1;i++)
      {
        if(i==0 && fFile.get(0).equals("\n")){}
          else{fv += fFile.get(i) + "\n";}
      }
   
      //in case shuffle precedes.
      fv.replaceAll("\n\n\n\n", "\n");
      return fv.replaceAll("\n\n\n", "\n");
    } 
    */
}
// TOP
//
// Related classes of main.MLPipeline
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.