/***************************************************************************
Clinical Named Entity Recognizer and Normalizer(Clinical NERC), (v0.1).
Copyright (C) 2013 Azad Dehghan
Contact: a.dehghan@manchester.ac.uk
*****************************************************************************/
package main;
import gate.Annotation;
import gate.Factory;
import gate.FeatureMap;
import gate.creole.ResourceInstantiationException;
import io.FileOps;
import io.RecurseFolder;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import classification.MappingPipeline;
import serialize.Serializ;
import stem.Stemmer;
/**
 * Static entry points that drive the NER pipeline: building CRF++ feature rows
 * from a GATE document and running the pipeline over a single string or over
 * every file of a corpus directory.
 */
public class Logic {

    /**
     * Stemmer instance shared by every call to {@link #getStem(String)}.
     * NOTE(review): the instance is mutated on each call, so concurrent callers
     * would presumably interfere with each other -- confirm callers are single-threaded.
     */
    private static final Stemmer stem = new Stemmer();

    /**
     * Appends the CRF++ feature rows of a document to {@code featureFile}.
     * <p>
     * One list entry is added per "Sentence" annotation (in offset order); it holds
     * one tab-separated, newline-terminated row per "Token" annotation inside that
     * sentence: {@code docName:tokenId, text, category, chunk, kind, orth, stem}.
     * A final {@code "\n"} entry is added to separate documents.
     *
     * @param gateDoc     document carrying "Sentence" and "Token" annotations
     * @param featureFile list the rows are appended to
     */
    public static void constructFeatureFile(gate.Document gateDoc, ArrayList<String> featureFile)
    {
        List<Annotation> sentences = new ArrayList<Annotation>(gateDoc.getAnnotations().get("Sentence"));
        Collections.sort(sentences, gate.Utils.OFFSET_COMPARATOR);
        String docName = gateDoc.getName();

        for (Annotation sentence : sentences)
        {
            List<Annotation> tokens = new ArrayList<Annotation>(gateDoc.getAnnotations().get(
                    "Token", sentence.getStartNode().getOffset(), sentence.getEndNode().getOffset()));
            Collections.sort(tokens, gate.Utils.OFFSET_COMPARATOR);

            // StringBuilder keeps row assembly linear in the number of tokens.
            StringBuilder rows = new StringBuilder();
            for (Annotation token : tokens)
            {
                FeatureMap features = token.getFeatures();
                String text = gate.Utils.stringFor(gateDoc, token);

                // A missing "orth" feature (or the literal text "null") is written as "O".
                String orth = String.valueOf(features.get("orth"));
                if ("null".equals(orth.trim()))
                {
                    orth = "O";
                }

                rows.append(docName).append(":").append(token.getId())
                    .append("\t").append(text)
                    .append("\t").append(features.get("category"))
                    .append("\t").append(features.get("chunk"))
                    .append("\t").append(features.get("kind"))
                    .append("\t").append(orth)
                    .append("\t").append(getStem(text))
                    .append("\n");
            }
            featureFile.add(rows.toString());
        }
        featureFile.add("\n"); // insert empty line after each document
    }

    /**
     * Lower-cases a term and runs it through the shared {@link Stemmer}.
     *
     * @param term term to stem; must not be {@code null}
     * @return stem of term
     */
    public static String getStem(String term)
    {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. "I" must become "i" even under a Turkish locale).
        char[] chars = term.toLowerCase(java.util.Locale.ROOT).toCharArray();
        for (char c : chars)
        {
            stem.add(c);
        }
        stem.stem();
        return stem.toString();
    }

    /**
     * Processes a sentence/string (e.g. received from IO/socket) and returns its events.
     * <p>
     * Pipeline failures are reported on {@code System.err} and not rethrown; the
     * events are then extracted from whatever document state was reached (possibly
     * {@code null}), exactly as before. The document created for the call is
     * released before returning.
     *
     * @param s        text input
     * @param p        pipeline used to process the text
     * @param mp       mapping pipeline handed through to {@code p}
     * @param nercFlag flag handed through to {@code p}
     * @return result of {@code Event.getEvents} for the processed document
     */
    public static String opt1(String s, MLPipeline p, MappingPipeline mp, Boolean nercFlag)
    {
        gate.Document gateDoc = null;
        try {
            try {
                gateDoc = Factory.newDocument(s);
                gateDoc.setName("x");
                gateDoc = p.ml(gateDoc, mp, nercFlag);
            } catch (Exception e) {
                System.err.println("Logic.opt1(..): " +e.getMessage());
            }
            return Event.getEvents(gateDoc);
        } finally {
            // Release the per-call document; without this every call leaves a
            // document registered with GATE.
            if (gateDoc != null)
            {
                Factory.deleteResource(gateDoc);
            }
        }
    }

    /**
     * Processes a corpus (text document(s)) by delegating to
     * {@code MLPipeline.ml(source_dir, op_dir, nercFlag)}.
     * Failures are reported on {@code System.err} and not rethrown.
     *
     * @param source_dir corpus/document(s) to process
     * @param op_dir     output directory
     * @param nercFlag   flag handed through to the pipeline
     */
    public static void opt2(String source_dir, String op_dir, Boolean nercFlag)
    {
        MLPipeline ml_p = new MLPipeline();
        try {
            ml_p.ml(source_dir, op_dir, nercFlag);
        } catch (Exception e) {
            System.err.println("Logic.opt2(..): " + e.getMessage());
        }
        System.out.println(".opt2.processing/output complete");
    }

    /**
     * Processes every file under {@code source_dir} one document at a time and
     * serializes each processed document to {@code op_dir}.
     * <p>
     * A file that fails is reported on {@code System.err} and skipped (nothing is
     * serialized for it). Each document is released as soon as it has been written.
     *
     * @param source_dir corpus/document(s) to process
     * @param op_dir     output directory
     * @param nercFlag   when true a {@link MappingPipeline} is created and handed to the pipeline
     */
    public static void opt3(String source_dir, String op_dir, Boolean nercFlag)
    {
        MLPipeline ml_p = new MLPipeline();
        MappingPipeline mp = null;
        try {
            if(nercFlag)
                mp = new MappingPipeline();
        } catch (MalformedURLException e1) {
            // Previously swallowed silently; processing continues without a mapping pipeline.
            System.err.println("Logic.opt3(..): " + e1.getMessage());
        }

        ArrayList<File> fileList = RecurseFolder.getFileList(source_dir);
        for(File f: fileList)
        {
            gate.Document gateDoc = processFile(f, ml_p, mp, nercFlag, "Logic.opt3(..): ");
            if (gateDoc == null)
            {
                continue; // failure already reported; do not re-serialize an earlier document
            }
            try {
                Serializ.serialize(gateDoc, op_dir);
            } finally {
                Factory.deleteResource(gateDoc);
            }
        }
        System.out.println(".opt3.processing/output complete");
    }

    /**
     * Processes every file under {@code source_dir} and writes the events of all
     * documents to {@code op_dir/NER_offsets.txt}.
     * <p>
     * For each file a {@code filename:<name>} line is written, followed by the
     * document's events and a newline. A file that fails is reported on
     * {@code System.err} and contributes no events. Each document is released once
     * its events have been collected.
     *
     * @param source_dir corpus/document(s) to process
     * @param op_dir     output directory
     * @param nercFlag   flag handed through to the pipeline
     */
    public static void opt4(String source_dir, String op_dir, Boolean nercFlag)
    {
        MappingPipeline mp = null;
        try {
            mp = new MappingPipeline();
        } catch (MalformedURLException e1) {
            System.err.println("Logic.opt4(..): " + e1.getMessage());
        }
        MLPipeline ml_p = new MLPipeline();

        StringBuilder allEvents = new StringBuilder();
        ArrayList<File> fileList = RecurseFolder.getFileList(source_dir);
        for(File f: fileList)
        {
            System.out.print("\r.processing: " + f.getName() + "\r");
            allEvents.append("filename:").append(f.getName()).append("\n");

            gate.Document gateDoc = processFile(f, ml_p, mp, nercFlag, "Logic.opt4(..): ");
            if (gateDoc != null)
            {
                try {
                    allEvents.append(Event.getEvents(gateDoc));
                } finally {
                    Factory.deleteResource(gateDoc);
                }
            }
            allEvents.append("\n");
        }
        FileOps.saveFile(op_dir +"/" + "NER_offsets.txt", allEvents.toString());
        System.out.println(".opt4.processing/output complete");
    }

    /**
     * Loads one file into a GATE document named after the file and runs it through the pipeline.
     *
     * @param f         file to load
     * @param pipeline  pipeline to run
     * @param mp        mapping pipeline handed through to {@code pipeline} (may be null)
     * @param nercFlag  flag handed through to {@code pipeline}
     * @param logPrefix prefix for the error message printed on failure
     * @return the document returned by the pipeline, or {@code null} if loading or
     *         processing failed (any partially created document is released)
     */
    private static gate.Document processFile(File f, MLPipeline pipeline, MappingPipeline mp,
            Boolean nercFlag, String logPrefix)
    {
        gate.Document doc = null;
        try {
            doc = Factory.newDocument(FileOps.getFileContent(f.toURI().toURL()));
            doc.setName(f.getName());
            return pipeline.ml(doc, mp, nercFlag);
        } catch (Exception e) {
            System.err.println(logPrefix + e.getMessage());
            if (doc != null)
            {
                Factory.deleteResource(doc);
            }
            return null;
        }
    }
}