package main;
/***************************************************************************
Clinical Named Entity Recognizer and Normalizer(Clinical NERC), (v0.1).
Copyright (C) 2013 Azad Dehghan
Contact: a.dehghan@manchester.ac.uk
*****************************************************************************/
import crfplusplus.Crf;
import gate.Annotation;
import gate.Factory;
import io.FileOps;
import io.RecurseFolder;
import java.io.File;
import java.util.ArrayList;
import classification.MappingPipeline;
/**
 * Machine-learning stage of the Clinical NERC system.
 *
 * <p>For each document it runs the pre-processing pipeline, builds a CRF feature
 * file, tags the document's {@code Token} annotations with three CRF models
 * (one after another), runs the post-processing pipeline after each model and,
 * optionally, the mapping (classification) pipeline.
 *
 * <p>The class keeps no state between calls: all working data lives in locals.
 */
public class MLPipeline {

    /**
     * CRF model name stems, applied in this order. Each is resolved to
     * {@code models/<name>_model_iobw} and is also used as the suffix of the
     * {@code crf_prediction} feature value written on each token.
     */
    private static final String[] MODEL_NAMES = {"test", "treatment", "problem"};

    /**
     * Creates the ML pipeline. Instantiates {@link Pipeline} and discards the
     * instance; presumably this is done for its initialisation side effects
     * (NOTE(review): confirm against Pipeline's constructor).
     */
    public MLPipeline()
    {
        new Pipeline();
    }

    /**
     * Runs the ML pipeline over every file returned by
     * {@code RecurseFolder.getFileList(source_dir)} and writes one XML file per
     * input into {@code op_dir}.
     *
     * <p>The output file is named after the input file name up to (not including)
     * its first dot, plus {@code .xml}; a name without a dot (or starting with one)
     * is used unchanged. Each GATE document is released via
     * {@code Factory.deleteResource} as soon as it has been saved, or if its
     * processing fails.
     *
     * @param source_dir directory containing the input files
     * @param op_dir     directory the resulting XML files are written to
     * @param nercFlag   when {@code true} the mapping (classification) pipeline is
     *                   run after each model; {@code null} is treated as {@code false}
     * @throws Exception if reading, processing or saving any document fails
     */
    public void ml(String source_dir, String op_dir, Boolean nercFlag) throws Exception
    {
        boolean classify = Boolean.TRUE.equals(nercFlag);
        MappingPipeline mapping = null;
        if (classify)
        {
            mapping = new MappingPipeline();
        }

        ArrayList<File> fileList = RecurseFolder.getFileList(source_dir);
        for (File f : fileList)
        {
            System.out.print("\r.processing: " + f.getName() + "\r");
            String filename = baseName(f.getName());

            gate.Document doc = null;
            try
            {
                doc = Factory.newDocument(FileOps.getFileContent(f.getAbsolutePath()));
                doc = Pipeline.preProcPipeline(doc);
                doc.setName(f.getName().replace(".txt", ""));

                // construct a feature vector for document f
                ArrayList<String> featureFile = new ArrayList<String>();
                Logic.constructFeatureFile(doc, featureFile);

                for (String model : MODEL_NAMES)
                {
                    doc = applyModel(doc, featureFile, model, mapping, classify);
                }

                // save to op_dir
                FileOps.saveFile(op_dir + "/" + filename + ".xml", doc.toXml());
            }
            finally
            {
                // Release every document, not only the last one, and also on failure.
                if (doc != null)
                {
                    Factory.deleteResource(doc);
                }
            }
        }
    }

    /**
     * Runs the ML pipeline on a single, already loaded GATE document.
     *
     * <p>The document is not released here; the returned document remains owned
     * by the caller.
     *
     * @param gateDoc  document to process
     * @param mapping  mapping (classification) pipeline; only used, and only
     *                 required to be non-null, when {@code nercFlag} is {@code true}
     * @param nercFlag when {@code true} {@code mapping} is run after each model;
     *                 {@code null} is treated as {@code false}
     * @return the document as returned by the last pipeline stage
     * @throws IllegalArgumentException if {@code nercFlag} is {@code true} but
     *                                  {@code mapping} is {@code null}
     * @throws Exception                if any pipeline stage fails
     */
    public gate.Document ml(gate.Document gateDoc, MappingPipeline mapping, Boolean nercFlag) throws Exception
    {
        boolean classify = Boolean.TRUE.equals(nercFlag);
        if (classify && mapping == null)
        {
            // Fail before doing any work rather than with an NPE after the first model.
            throw new IllegalArgumentException("mapping must not be null when nercFlag is true");
        }

        gateDoc = Pipeline.preProcPipeline(gateDoc);

        // construct a feature vector for the document, i.e. parse the pipeline output
        ArrayList<String> featureFile = new ArrayList<String>();
        Logic.constructFeatureFile(gateDoc, featureFile);

        for (String model : MODEL_NAMES)
        {
            gateDoc = applyModel(gateDoc, featureFile, model, mapping, classify);
        }
        return gateDoc;
    }

    /**
     * Tags the document with one CRF model, then post-processes and (optionally)
     * classifies it.
     *
     * <p>Post-processing turns the per-token {@code crf_prediction} tags into
     * annotation spans over one or more tokens; it has to run after every model
     * because the next model overwrites the per-token feature.
     * (Removing false positives / adjusting boundaries: rule set not added yet, 20.4.13.)
     */
    private static gate.Document applyModel(gate.Document doc, ArrayList<String> featureFile,
            String model, MappingPipeline mapping, boolean classify) throws Exception
    {
        String taggedFile = Crf.run("models/" + model + "_model_iobw", featureFile);
        writePredictions(doc, taggedFile, model);

        doc = Pipeline.postProcPipeline(doc);

        // classification pipeline
        if (classify)
        {
            doc = mapping.run(doc);
        }
        return doc;
    }

    /**
     * Copies the labels from CRF output onto the document's Token annotations.
     *
     * <p>Each non-blank output line is split on tabs: the text after the first
     * {@code ':'} of column 0 is parsed as the Token annotation id, column 1 is the
     * label. The token's {@code crf_prediction} feature is set to
     * {@code <label>_<model>}.
     */
    private static void writePredictions(gate.Document doc, String taggedFile, String model)
    {
        for (String line : taggedFile.split("\n"))
        {
            if (line.trim().isEmpty())
            {
                // Blank lines carry no token; indexing column 1 would fail on them.
                continue;
            }

            String[] row = line.split("\t");
            if (row.length < 2)
            {
                throw new IllegalStateException(
                        "Malformed CRF output row for model '" + model + "': " + line);
            }

            String label = row[1];
            int tokenId = Integer.parseInt(row[0].substring(row[0].indexOf(':') + 1));

            Annotation aToken = doc.getAnnotations().get("Token").get(tokenId);
            if (aToken == null)
            {
                throw new IllegalStateException(
                        "CRF output for model '" + model + "' references unknown Token id " + tokenId);
            }

            gate.FeatureMap gateMap = aToken.getFeatures();
            gateMap.put("crf_prediction", label + "_" + model);
            aToken.setFeatures(gateMap);
        }
    }

    /**
     * Returns the file name up to (not including) its first dot. A name without
     * a dot, or whose first dot is its first character, is returned unchanged so
     * the output file name is never empty.
     */
    private static String baseName(String fileName)
    {
        int dot = fileName.indexOf('.');
        return dot > 0 ? fileName.substring(0, dot) : fileName;
    }

    /*
     * Disabled legacy helper, kept for reference only.
     * Note: the result of the first replaceAll call below is discarded
     * (Strings are immutable), so that line has no effect.
     *
    public static String getFeatureFile(ArrayList<String> fFile)
    {
    String fv = "";
    for(int i=0;i<fFile.size()-1;i++)
    {
    if(i==0 && fFile.get(0).equals("\n")){}
    else{fv += fFile.get(i) + "\n";}
    }
    //in case shuffle precedes.
    fv.replaceAll("\n\n\n\n", "\n");
    return fv.replaceAll("\n\n\n", "\n");
    }
    */
}