package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Identifies chunks based on labels that uses IOB like encoding
* Assumes labels have the form <tag>-<type>
* where the tag is a prefix indicating where in the chunk it is.
* Supports various encodings: IO, IOB, IOE, BILOU, SBEIO, []
* The type is
* Example: Bill works for Bank of America
* IO: I-PER O O I-ORG I-ORG I-ORG
* IOB1: B-PER O O B-ORG I-ORG I-ORG
* IOB2: I-PER O O B-ORG I-ORG I-ORG
* IOE1: E-PER O O I-ORG I-ORG E-ORG
* IOE2: I-PER O O I-ORG I-ORG E-ORG
* BILOU: U-PER O O B-ORG I-ORG L-ORG
* SBEIO: S-PER O O B-ORG I-ORG E-ORG
* @author Angel Chang
*/
public class LabeledChunkIdentifier {
/**
* Whether to use or ignore provided tag (the label prefix)
*/
private boolean ignoreProvidedTag = false;
/**
* Label/Type indicating the token is not a part of a chunk
*/
private String negLabel = "O";
/**
* What tag to default to if label/type indicate it is part of a chunk
* (used if type does not match negLabel and
* the tag is not provided or ignoreProvidedTag is set)
*/
private String defaultPosTag = "I";
/**
* What tag to default to if label/type indicate it is not part of a chunk
* (used if type matches negLabel and
* the tag is not provided or ignoreProvidedTag is set)
*/
private String defaultNegTag = "O";
/**
* Find and annotate chunks. Returns list of CoreMap (Annotation) objects.
* @param tokens - List of tokens to look for chunks
* @param totalTokensOffset - Index of tokens to offset by
* @param textKey - Key to use to find the token text
* @param labelKey - Key to use to find the token label (to determine if inside chunk or not)
* @return List of annotations (each as a CoreMap) representing the chunks of tokens
*/
@SuppressWarnings("unchecked")
public List<CoreMap> getAnnotatedChunks(List<CoreLabel> tokens, int totalTokensOffset, Class textKey, Class labelKey)
{
return getAnnotatedChunks(tokens, totalTokensOffset, textKey, labelKey, null, null);
}
@SuppressWarnings("unchecked")
public List<CoreMap> getAnnotatedChunks(List<CoreLabel> tokens, int totalTokensOffset, Class textKey, Class labelKey,
Function<Pair<CoreLabel, CoreLabel>, Boolean> checkTokensCompatible)
{
return getAnnotatedChunks(tokens, totalTokensOffset, textKey, labelKey, null, null, checkTokensCompatible);
}
@SuppressWarnings("unchecked")
public List<CoreMap> getAnnotatedChunks(List<CoreLabel> tokens, int totalTokensOffset,
Class textKey, Class labelKey,
Class tokenChunkKey, Class tokenLabelKey)
{
return getAnnotatedChunks(tokens, totalTokensOffset, textKey, labelKey, tokenChunkKey, tokenLabelKey, null);
}
/**
* Find and annotate chunks. Returns list of CoreMap (Annotation) objects
* each representing a chunk with the following annotations set:
* CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
* CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
* TokensAnnotation - List of tokens in this chunk
* TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
* TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
* TextAnnotation - String representing tokens in this chunks (token text separated by space)
* @param tokens - List of tokens to look for chunks
* @param totalTokensOffset - Index of tokens to offset by
* @param labelKey - Key to use to find the token label (to determine if inside chunk or not)
* @param textKey - Key to use to find the token text
* @param tokenChunkKey - If not null, each token is annotated with the chunk using this key
* @param tokenLabelKey - If not null, each token is annotated with the text associated with the chunk using this key
* @param checkTokensCompatible - If not null, additional check to see if this token and the previous are compatible
* @return List of annotations (each as a CoreMap) representing the chunks of tokens
*/
@SuppressWarnings("unchecked")
public List<CoreMap> getAnnotatedChunks(List<CoreLabel> tokens, int totalTokensOffset,
Class textKey, Class labelKey,
Class tokenChunkKey, Class tokenLabelKey,
Function<Pair<CoreLabel, CoreLabel>, Boolean> checkTokensCompatible)
{
List<CoreMap> chunks = new ArrayList();
LabelTagType prevTagType = null;
int tokenBegin = -1;
for (int i = 0; i < tokens.size(); i++) {
CoreLabel token = tokens.get(i);
String label = (String) token.get(labelKey);
LabelTagType curTagType = getTagType(label);
boolean isCompatible = true;
if (checkTokensCompatible != null) {
CoreLabel prev = null;
if (i > 0) {
prev = tokens.get(i-1);
}
Pair<CoreLabel,CoreLabel> p = Pair.makePair(token, prev);
isCompatible = checkTokensCompatible.apply(p);
}
if (isEndOfChunk(prevTagType, curTagType) || !isCompatible) {
int tokenEnd = i;
if (tokenBegin >= 0 && tokenEnd > tokenBegin) {
CoreMap chunk = ChunkAnnotationUtils.getAnnotatedChunk(tokens, tokenBegin, tokenEnd, totalTokensOffset,
tokenChunkKey, textKey, tokenLabelKey);
chunk.set(labelKey, prevTagType.type);
chunks.add(chunk);
tokenBegin = -1;
}
}
if (isStartOfChunk(prevTagType, curTagType) || (!isCompatible && isChunk(curTagType))) {
if (tokenBegin >= 0) {
throw new RuntimeException("New chunk started, prev chunk not ended yet!");
}
tokenBegin = i;
}
prevTagType = curTagType;
}
if (tokenBegin >= 0) {
CoreMap chunk = ChunkAnnotationUtils.getAnnotatedChunk(tokens, tokenBegin, tokens.size(), totalTokensOffset,
tokenChunkKey, textKey, tokenLabelKey);
chunk.set(labelKey, prevTagType.type);
chunks.add(chunk);
}
// System.out.println("number of chunks " + chunks.size());
return chunks;
}
/**
* Returns whether a chunk ended between the previous and current token
* @param prevTag - the tag of the previous token
* @param prevType - the type of the previous token
* @param curTag - the tag of the current token
* @param curType - the type of the current token
* @return true if the previous token was the last token of a chunk
*/
public static boolean isEndOfChunk(String prevTag, String prevType, String curTag, String curType)
{
boolean chunkEnd = false;
if ( "B".equals(prevTag) && "B".equals(curTag) ) { chunkEnd = true; }
if ( "B".equals(prevTag) && "O".equals(curTag) ) { chunkEnd = true; }
if ( "I".equals(prevTag) && "B".equals(curTag) ) { chunkEnd = true; }
if ( "I".equals(prevTag) && "O".equals(curTag) ) { chunkEnd = true; }
if ( "E".equals(prevTag) || "L".equals(prevTag)
|| "S".equals(prevTag) || "U".equals(prevTag)
|| "[".equals(prevTag) || "]".equals(prevTag)) { chunkEnd = true; }
if (!"O".equals(prevTag) && !".".equals(prevTag) && !prevType.equals(curType)) {
chunkEnd = true;
}
return chunkEnd;
}
/**
* Returns whether a chunk ended between the previous and current token
* @param prev - the label/tag/type of the previous token
* @param cur - the label/tag/type of the current token
* @return true if the previous token was the last token of a chunk
*/
public static boolean isEndOfChunk(LabelTagType prev, LabelTagType cur)
{
if (prev == null) return false;
return isEndOfChunk(prev.tag, prev.type, cur.tag, cur.type);
}
/**
* Returns whether a chunk started between the previous and current token
* @param prevTag - the tag of the previous token
* @param prevType - the type of the previous token
* @param curTag - the tag of the current token
* @param curType - the type of the current token
* @return true if the current token was the first token of a chunk
*/
public static boolean isStartOfChunk(String prevTag, String prevType, String curTag, String curType)
{
boolean chunkStart = false;
boolean prevTagE = "E".equals(prevTag) || "L".equals(prevTag) || "S".equals(prevTag) || "U".equals(prevTag);
boolean curTagE = "E".equals(curTag) || "L".equals(curTag) || "S".equals(curTag) || "U".equals(curTag);
if ( prevTagE && curTagE ) { chunkStart = true; }
if ( prevTagE && "I".equals(curTag) ) { chunkStart = true; }
if ( "O".equals(prevTag) && curTagE ) { chunkStart = true; }
if ( "O".equals(prevTag) && "I".equals(curTag) ) { chunkStart = true; }
if ( "B".equals(curTag) || "S".equals(curTag) || "U".equals(curTag)
|| "[".equals(curTag) || "]".equals(curTag)) { chunkStart = true; }
if (!"O".equals(curTag) && !".".equals(curTag) && !prevType.equals(curType)) {
chunkStart = true;
}
return chunkStart;
}
/**
* Returns whether a chunk started between the previous and current token
* @param prev - the label/tag/type of the previous token
* @param cur - the label/tag/type of the current token
* @return true if the current token was the first token of a chunk
*/
public static boolean isStartOfChunk(LabelTagType prev, LabelTagType cur)
{
if (prev == null) {
return isStartOfChunk("O", "O", cur.tag, cur.type);
} else {
return isStartOfChunk(prev.tag, prev.type, cur.tag, cur.type);
}
}
public static boolean isChunk(LabelTagType cur) {
return (!"O".equals(cur.tag) && !".".equals(cur.tag));
}
private static Pattern labelPattern = Pattern.compile("^([^-]*)-(.*)$");
/**
* Class representing a label, tag and type
*/
public static class LabelTagType
{
public String label;
public String tag;
public String type;
public LabelTagType(String label, String tag, String type)
{
this.label = label;
this.tag = tag;
this.type = type;
}
public boolean typeMatches(LabelTagType other)
{
return this.type.equals(other.type);
}
public String toString()
{
StringBuilder sb = new StringBuilder();
sb.append("(");
sb.append(label).append(",");
sb.append(tag).append(",");
sb.append(type).append(")");
return sb.toString();
}
}
public LabelTagType getTagType(String label)
{
if (label == null) {
return new LabelTagType(negLabel, defaultNegTag, negLabel);
}
String type;
String tag;
Matcher matcher = labelPattern.matcher(label);
if (matcher.matches()) {
if (ignoreProvidedTag) {
type = matcher.group(2);
if (negLabel.equals(type)) {
tag = defaultNegTag;
} else {
tag = defaultPosTag;
}
} else {
tag = matcher.group(1);
type = matcher.group(2);
}
} else {
type = label;
if (negLabel.equals(label)) {
tag = defaultNegTag;
} else {
tag = defaultPosTag;
}
}
return new LabelTagType(label, tag, type);
}
public String getDefaultPosTag() {
return defaultPosTag;
}
public void setDefaultPosTag(String defaultPosTag) {
this.defaultPosTag = defaultPosTag;
}
public String getDefaultNegTag() {
return defaultNegTag;
}
public void setDefaultNegTag(String defaultNegTag) {
this.defaultNegTag = defaultNegTag;
}
public String getNegLabel() {
return negLabel;
}
public void setNegLabel(String negLabel) {
this.negLabel = negLabel;
}
public boolean isIgnoreProvidedTag() {
return ignoreProvidedTag;
}
public void setIgnoreProvidedTag(boolean ignoreProvidedTag) {
this.ignoreProvidedTag = ignoreProvidedTag;
}
}