package net.sf.nlpshell;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.Unmarshaller;
import javax.xml.transform.stream.StreamSource;
import jcolibri.extensions.textual.wordnet.WordNetBridge;
import net.didion.jwnl.data.IndexWord;
import net.didion.jwnl.dictionary.Dictionary;
import net.sf.nlpshell.domain.PartOfSpeech;
import net.sf.nlpshell.domain.Predicates;
import net.sf.nlpshell.domain.SemanticFrame;
import net.sf.nlpshell.domain.SemanticFrame.FrameItem;
import net.sf.nlpshell.domain.ThematicRoles;
import net.sf.nlpshell.domain.word.verb.Verb;
import opennlp.tools.coref.DefaultLinker;
import opennlp.tools.coref.DiscourseEntity;
import opennlp.tools.coref.Linker;
import opennlp.tools.coref.LinkerMode;
import opennlp.tools.coref.mention.DefaultParse;
import opennlp.tools.coref.mention.Mention;
import opennlp.tools.parser.AbstractBottomUpParser;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import vn.DESCRIPTION;
import vn.EXAMPLE;
import vn.FRAME;
import vn.MEMBER;
import vn.NP;
import vn.PRED;
import vn.SEMANTICS;
import vn.VERB;
import vn.VNCLASS;
import vn.VNSUBCLASS;
public class Main {
private static Pattern untokenizedParenPattern1 = Pattern
.compile("([^ ])([({)}])");
private static Pattern untokenizedParenPattern2 = Pattern
.compile("([({)}])([^ ])");
/**
* @param args ignored
*/
public static void main(String[] args) throws Exception {
Main mySelf = new Main();
mySelf.doIt();
}
public void doIt() throws Exception {
// Workaround for JWNL: the WordNet build packaged by jCOLIBRI is the
// Windows one.
// System.setProperty("os.name", "windows");
System.setProperty("WNSEARCHDIR", new File("WordNet-3.0/dict").getAbsolutePath());
WordNetBridge.init();
// String paragraph =
// "This isn't the greatest example sentence in the world because I've seen better. Neither is this one. This one's not bad, though.";
// String paragraph = "Linda gave a student a book";
String paragraph = "can you find a plain-text file that is called \u201Cpippo\u201D ?";
int numParses = 5;
// the sentence detector and tokenizer are built from their serialized
// model files
SentenceDetectorME sdetector = new SentenceDetectorME(
new SentenceModel(new FileInputStream(
"models/en-sent.bin")));
Tokenizer tokenizer = new TokenizerME(new TokenizerModel(
new FileInputStream("models/en-token.bin")));
// the parser is built from its serialized model plus beam-search
// options; the block below shows the pre-1.5 TreebankParser API that
// this ParserFactory call replaces
/*
 * boolean useTagDict = true; boolean useCaseInsensitiveTagDict = false;
 * int beamSize = opennlp.tools.parser.chunking.Parser.defaultBeamSize;
 * double advancePercentage =
 * opennlp.tools.parser.chunking.Parser.defaultAdvancePercentage;
 * opennlp.tools.parser.Parser parser = TreebankParser.getParser(
 * "models/parser", useTagDict, useCaseInsensitiveTagDict, beamSize,
 * advancePercentage);
 */
Parser parser = ParserFactory.create(new ParserModel(
new FileInputStream("models/en-parser-chunking.bin")),
AbstractBottomUpParser.defaultBeamSize,
AbstractBottomUpParser.defaultAdvancePercentage);
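// defaultBeamSize bounds how many partial parses the beam search keeps
// alive; defaultAdvancePercentage is the probability-mass threshold
// for advancing them (OpenNLP's stock settings for both).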
// break the paragraph into sentences
String[] sents = sdetector.sentDetect(paragraph);
// TODO handle paragraphs made of multiple sentences
String sent = sents[0];
// tokenize brackets and parentheses by putting a space on either side,
// so they cannot be confused with the parser's own bracketed output
sent = untokenizedParenPattern1.matcher(sent).replaceAll("$1 $2");
sent = untokenizedParenPattern2.matcher(sent).replaceAll("$1 $2");
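// e.g. a hypothetical input "called(pippo)" becomes "called ( pippo )"
// once both patterns have run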
// get the tokenizer to break apart the sentence
String[] tokens = tokenizer.tokenize(sent);
// build a string to parse as well as a list of tokens
StringBuilder sb = new StringBuilder();
List<String> tokenList = new ArrayList<String>();
for (String token : tokens) {
String tok = convertToken(token);
tokenList.add(tok);
sb.append(tok).append(" ");
}
String text = sb.substring(0, sb.length() - 1);
// the parent parse instance spans the entire sentence
Parse p = new Parse(text, new Span(0, text.length()),
AbstractBottomUpParser.INC_NODE, 1.0, null);
// create a parse object for each token and add it to the parent
int start = 0;
for (final String tok : tokenList) {
p.insert(new Parse(text, new Span(start, start + tok.length()),
AbstractBottomUpParser.TOK_NODE, 1.0, 0));
start += tok.length() + 1;
}
// fetch multiple possible parse trees
Parse[] parses = parser.parse(p, numParses);
// take the first parse as the best candidate
Parse chosenParse = parses[0 /* TODO handle other parse trees */];
for (Parse parse : parses) {
System.out.print("Prob[" + parse.getProb() + "] : ");
parse.show();
}
System.out.println("chosen parse : " + chosenParse);
List<Mention> document = new ArrayList<Mention>(1);
Linker treebankLinker = new DefaultLinker(
// LinkerMode must be TEST; LinkerMode.EVAL was tried for a long time
// before it turned out to be the problem
"models/coref", LinkerMode.TEST);
int sentenceNumber = 0;
Mention[] extents = treebankLinker.getMentionFinder().getMentions(
new DefaultParse(chosenParse, sentenceNumber));
// construct new parses for mentions which don't have constituents
for (final Mention mention : extents) {
if (mention.getParse() == null) {
// a mention with no constituent gets a synthetic NML node
Parse snp = new Parse(p.getText(), mention.getSpan(), "NML",
1.0, null);
p.insert(snp);
mention.setParse(new DefaultParse(snp, sentenceNumber));
}
}
document.addAll(Arrays.asList(extents));
if (!document.isEmpty()) {
DiscourseEntity[] entities = treebankLinker.getEntities(document
.toArray(new Mention[0]));
// showEntities(entities); // see the showEntities sketch below
(new CorefParse(Arrays.asList(parses), entities)).show();
}
PartOfSpeech pos = new OpenNlpConverter().buildPOS(chosenParse);
// extract frame
SemanticFrame searchFrame = pos.frame();
String frameString = searchFrame.toFrameString();
System.out.println("Frame : " + frameString);
// extract verb
Verb verb = searchFrame.getVerb();
// choose lemma
IndexWord iw = Dictionary.getInstance().lookupIndexWord(
net.didion.jwnl.data.POS.VERB, verb.text);
if (iw == null) {
throw new IllegalStateException("No WordNet entry for verb : ["
+ verb.text + "]");
}
System.out.println("lemma : " + iw.getLemma());
System.out.println("senses : " + iw.getSenseCount());
String verbLemma = iw.getLemma();
// look the verb up in VerbNet to find its interpretation
File verbnetDir = new File("verbnet-3.1");
if (!verbnetDir.exists()) {
throw new IllegalStateException("Cannot find VerbNet : ["
+ verbnetDir.getAbsolutePath() + "]");
}
// index VerbNet: map each member verb lemma to the classes and
// subclasses that list it
Map<String, Set<VNCLASS>> frameMembers = new LinkedHashMap<String, Set<VNCLASS>>();
Map<String, Set<VNSUBCLASS>> frameSubclassMembers = new LinkedHashMap<String, Set<VNSUBCLASS>>();
JAXBContext jaxbContext = JAXBContext.newInstance("vn");
Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
FilenameFilter wildcardFileFilter = new WildcardFileFilter("*.xml");
for (File verbnetFile : verbnetDir.listFiles(wildcardFileFilter)) {
JAXBElement<VNCLASS> jaxbElement;
try {
jaxbElement = unmarshaller.unmarshal(new StreamSource(
verbnetFile), VNCLASS.class);
} catch (Exception e) {
throw new IllegalStateException("Cannot process : ["
+ verbnetFile.getAbsolutePath() + "]", e);
}
VNCLASS vnclass = jaxbElement.getValue();
// register the class under each of its direct members
for (MEMBER member : vnclass.getMEMBERS().getMEMBER()) {
Set<VNCLASS> classes = frameMembers.get(member.getName());
if (classes == null) {
classes = new HashSet<VNCLASS>();
frameMembers.put(member.getName(), classes);
}
classes.add(vnclass);
}
// members of a subclass also count as members of the enclosing class
for (VNSUBCLASS vnsubclass : vnclass.getSUBCLASSES().getVNSUBCLASS()) {
for (MEMBER member : vnsubclass.getMEMBERS().getMEMBER()) {
Set<VNCLASS> classes = frameMembers.get(member.getName());
if (classes == null) {
classes = new HashSet<VNCLASS>();
frameMembers.put(member.getName(), classes);
}
classes.add(vnclass);
Set<VNSUBCLASS> subclasses = frameSubclassMembers.get(member.getName());
if (subclasses == null) {
subclasses = new HashSet<VNSUBCLASS>();
frameSubclassMembers.put(member.getName(), subclasses);
}
subclasses.add(vnsubclass);
}
}
}
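// At this point frameMembers maps each verb lemma to every VerbNet
// class that mentions it; e.g. (an illustrative entry, not verified
// against the 3.1 XML) frameMembers.get("give") would hold the
// VNCLASS for give-13.1.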
// the only VerbNet predicate this pipeline currently acts on
Set<String> knownPredicates = new HashSet<String>();
knownPredicates.add("discover");
Set<VNCLASS> vnclasses = frameMembers.get(verbLemma);
if (vnclasses == null) {
throw new IllegalStateException("No VerbNet class for lemma : ["
+ verbLemma + "]");
}
// Set<VNSUBCLASS> vnSubclasses = frameSubclassMembers.get(verbLemma);
VerbnetInfo verbnetInfo = chooseFrame(searchFrame, knownPredicates,
vnclasses);
// walk the chosen VerbNet frame's syntax in step with the sentence
// frame, assuming the components align positionally
int position = 0;
List<Object> npOrVERBOrADJOrADVOrPREPOrLEX = verbnetInfo.frame
.getSYNTAX().getNPOrVERBOrADJOrADVOrPREPOrLEX();
for (final Object object : npOrVERBOrADJOrADVOrPREPOrLEX) {
if (object instanceof NP) {
NP np = (NP) object;
FrameItem frameItem = searchFrame.components.get(position);
frameItem.pos.thematicRole = ThematicRoles.valueOf(np
.getValue().toUpperCase());
}
if (object instanceof VERB) {
FrameItem frameItem = searchFrame.components.get(position);
System.out.println("Verb : " + frameItem.pos.text);
Verb v = ((Verb) frameItem.pos);
v.predicate = Predicates.valueOf(verbnetInfo.predicate
.getValue().toUpperCase());
}
position++;
}
System.out.println(pos);
}
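/**
 * Minimal sketch of the entity dump referenced above (a hypothetical
 * helper, not part of the original pipeline): prints each discourse
 * entity found by the coreference linker, relying only on
 * DiscourseEntity.toString().
 */
private static void showEntities(DiscourseEntity[] entities) {
for (int i = 0; i < entities.length; i++) {
System.out.println("entity " + i + " : " + entities[i]);
}
}
/**
 * Scans the given VerbNet classes for a frame whose syntactic
 * description matches the sentence frame and whose semantics carry a
 * predicate we know how to act on.
 */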
private VerbnetInfo chooseFrame(SemanticFrame searchFrame,
Set<String> knownPredicates, Set<VNCLASS> vnclasses) {
for (VNCLASS vnclass : vnclasses) {
List<FRAME> frames = vnclass.getFRAMES().getFRAME();
for (FRAME frame : frames) {
DESCRIPTION description = frame.getDESCRIPTION();
// TODO enhance frame detection (include "Dative", "Location"
// etc. )
if (searchFrame.coerent(description.getPrimary())) {
for (EXAMPLE example : frame.getEXAMPLES().getEXAMPLE()) {
System.out.println("verbnet frame detected : "
+ description.getPrimary() + " example : "
+ example.getvalue());
}
List<Object> npOrVERBOrADJOrADVOrPREPOrLEX = frame
.getSYNTAX().getNPOrVERBOrADJOrADVOrPREPOrLEX();
int position = 0;
for (Object object : npOrVERBOrADJOrADVOrPREPOrLEX) {
if (object instanceof NP) {
NP np = (NP) object;
FrameItem frameItem = searchFrame.components
.get(position);
System.out.println(np.getValue() + " : "
+ frameItem.pos.text);
}
if (object instanceof VERB) {
FrameItem frameItem = searchFrame.components
.get(position);
System.out.println("Verb : " + frameItem.pos.text);
}
position++;
}
SEMANTICS semantics = frame.getSEMANTICS();
List<PRED> predicates = semantics.getPRED();
for (PRED pred : predicates) {
System.out.println("Predicate : " + pred.getValue());
// TODO handle other predicates
if (knownPredicates.contains(pred.getValue())) {
return new VerbnetInfo(frame, pred);
}
}
}
}
}
throw new IllegalArgumentException("Cannot find frame for : ["
+ searchFrame.toFrameString() + "]");
}
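/**
 * Maps bracket tokens to their Penn Treebank escapes, e.g. "(" to
 * "-LRB-" and "}" to "-RCB-"; all other tokens pass through unchanged.
 */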
private static String convertToken(String token) {
if (token.equals("(")) {
return "-LRB-";
} else if (token.equals(")")) {
return "-RRB-";
} else if (token.equals("{")) {
return "-LCB-";
} else if (token.equals("}")) {
return "-RCB-";
}
return token;
}
}