package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ling.Sentence;
import junit.framework.TestCase;
import java.util.ArrayList;
import java.util.List;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;
/**
 * Tests for {@link RegexNERAnnotator}, run against the mapping file at {@link #MAPPING}.
 * Each test builds a single pre-tokenized sentence by hand, optionally seeds some tokens
 * with NER tags, runs the annotator, and then checks the resulting tag of every token.
 *
 * @author jtibs
 */
public class RegexNERAnnotatorITest extends TestCase {
  private static final String MAPPING = "/u/nlp/data/TAC-KBP2010/sentence_extraction/itest_map";
  private static RegexNERAnnotator annotator;

  /**
   * Creates the shared annotator on the first call and reuses it for later tests.
   */
  @Override
  public void setUp() throws Exception {
    synchronized(RegexNERAnnotator.class) {
      if (annotator == null) {
        annotator = new RegexNERAnnotator(MAPPING, false, null);
      }
    }
  }

  /**
   * Helper method, checks that each token is tagged with the expected NER type.
   *
   * @param tokens the tokens to inspect, after annotation
   * @param tags the expected NamedEntityTagAnnotation values, one per token, in order
   */
  private static void checkTags(List<CoreLabel> tokens, String ... tags) {
    assertEquals(tags.length, tokens.size());
    for (int i = 0; i < tags.length; ++i) {
      assertEquals("Mismatch for token " + i + " " + tokens.get(i),
                   tags[i], tokens.get(i).get(CoreAnnotations.NamedEntityTagAnnotation.class));
    }
  }

  /**
   * Helper method, sets the given NER tag on the tokens at the given positions.
   *
   * @param tokens the token list to modify in place
   * @param tag the NamedEntityTagAnnotation value to assign
   * @param indices positions in {@code tokens} that receive the tag
   */
  private static void seedTag(List<CoreLabel> tokens, String tag, int ... indices) {
    for (int index : indices) {
      tokens.get(index).set(CoreAnnotations.NamedEntityTagAnnotation.class, tag);
    }
  }

  /**
   * Helper method, wraps the tokens in a one-sentence document and runs the shared
   * annotator over it. The token objects are passed through as-is, so callers can
   * inspect them afterwards with {@link #checkTags}.
   *
   * @param tokens the tokens of the single sentence
   * @param text the document text handed to the {@link Annotation} constructor
   */
  private static void annotateTokens(List<CoreLabel> tokens, String text) {
    CoreMap sentence = new ArrayCoreMap();
    sentence.set(CoreAnnotations.TokensAnnotation.class, tokens);
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    sentences.add(sentence);

    Annotation corpus = new Annotation(text);
    corpus.set(CoreAnnotations.SentencesAnnotation.class, sentences);
    annotator.annotate(corpus);
  }

  /**
   * Checks a sentence that mixes seeded and unseeded tokens: the expected result keeps
   * the seeded PERSON tags and the LOCATION tag on Chicago, replaces the seeded LOCATION
   * on Illinois with STATE_OR_PROVINCE, and adds TITLE and IDEOLOGY to tokens that
   * started without a tag.
   */
  public void testBasicMatching() {
    String str = "President Barack Obama lives in Chicago , Illinois , " +
        "and is a practicing Christian .";
    String[] split = str.split(" ");
    List<CoreLabel> tokens = Sentence.toCoreLabelList(split);

    seedTag(tokens, "PERSON", 1, 2);
    seedTag(tokens, "LOCATION", 5, 7);

    // Note the space after "Illinois,": without it the two literals would run together.
    annotateTokens(tokens, "President Barack Obama lives in Chicago, Illinois, " +
        "and is a practicing Christian.");

    checkTags(tokens, "TITLE", "PERSON", "PERSON", "O", "O", "LOCATION", "O", "STATE_OR_PROVINCE",
              "O", "O", "O", "O", "O", "IDEOLOGY", "O");
  }

  /**
   * Neither the LOCATION nor the ORGANIZATION tags should be overridden, since
   * Ontario (STATE_OR_PROVINCE) and American (NATIONALITY) each cover only part of
   * the phrase that is NamedEntityTag-annotated.
   */
  public void testOverwrite() {
    String str = "I like Ontario Place , and I like the Native American Church , too .";
    String[] split = str.split(" ");
    List<CoreLabel> tokens = Sentence.toCoreLabelList(split);

    seedTag(tokens, "LOCATION", 2, 3);
    seedTag(tokens, "ORGANIZATION", 9, 10, 11);

    // Note the space after "Native": without it the two literals would run together.
    annotateTokens(tokens, "I like Ontario Place, and I like the Native " +
        "American Church, too.");

    checkTags(tokens, "O", "O", "LOCATION", "LOCATION", "O", "O", "O", "O", "O", "ORGANIZATION",
              "ORGANIZATION", "ORGANIZATION", "O", "O", "O");
  }

  /**
   * In the mapping file, Christianity is assigned a higher priority than Early Christianity,
   * and so Early should not be marked as RELIGION.
   */
  public void testPriority() {
    // String.split discards the trailing empty string, so this yields exactly ten tokens.
    String str = "Christianity is of higher regex priority than Early Christianity . ";
    String[] split = str.split(" ");
    List<CoreLabel> tokens = Sentence.toCoreLabelList(split);

    annotateTokens(tokens, "Christianity is of higher regex priority than Early " +
        "Christianity. ");

    checkTags(tokens, "RELIGION", "O", "O", "O", "O", "O", "O", "O", "RELIGION", "O");
  }

  /**
   * Test that if there are no annotations at all, the annotator
   * throws an exception. We are happy if we can catch an exception
   * and continue, and if we don't get any exceptions, we throw an
   * exception of our own.
   */
  public void testEmptyAnnotation() {
    try {
      annotator.annotate(new Annotation(""));
    } catch(RuntimeException expected) {
      // This is the outcome the test is looking for.
      return;
    }
    fail("Never expected to get this far... the annotator should have thrown an exception by now");
  }
}