* @param op_dir output directory; one {@code <name>.xml} file is saved here per processed input file
* @throws Exception
*/
public void ml(String source_dir, String op_dir, Boolean nercFlag) throws Exception
{
MappingPipeline mapping = null;
if(nercFlag)
mapping = new MappingPipeline();
featureFile = new ArrayList<String>();
ArrayList<File> fileList = RecurseFolder.getFileList(source_dir);
for(File f: fileList)
{
System.out.print("\r.processing: " + f.getName() + "\r");
String filename = f.getName().substring(0, f.getName().indexOf("."));
gateDoc = Factory.newDocument(FileOps.getFileContent(f.getAbsolutePath()));
gateDoc = Pipeline.preProcPipeline(gateDoc);
gateDoc.setName(f.getName().replace(".txt", ""));
//construct a feature vector for document f
featureFile.clear();
Logic.constructFeatureFile(gateDoc, featureFile);
for(int i = 0; i < 3; i++)
{
String model = "test";
if (i==1)
{model = "treatment";}
else if(i==2)
{model = "problem";}
String tagged_file = Crf.run("models/"+model + "_model_iobw", featureFile);
String[] aS = tagged_file.split("\n");
for(String s : aS)
{
String[] row = s.split("\t");
String label = row[1];
int tokenId = Integer.parseInt(row[0].substring(row[0].indexOf(":")+1, row[0].length()));
Annotation aToken = gateDoc.getAnnotations().get("Token").get(tokenId);
gate.FeatureMap gateMap = aToken.getFeatures();
gateMap.put("crf_prediction", label + "_" + model);
aToken.setFeatures(gateMap);
}
//post-process:
//-crf_prediction tags per token to final crf_prediction annotation span across 1 or more tokens [Done]
//-remove FP & adjust boundaries [rule set not added yet 20.4.13]
gateDoc = Pipeline.postProcPipeline(gateDoc);
//classification pipeline
if(nercFlag)
gateDoc = mapping.run(gateDoc);
}
//save to op_dir
FileOps.saveFile(op_dir + "/" + filename + ".xml", gateDoc.toXml());
}
Factory.deleteResource(gateDoc);