package edu.stanford.nlp.international.arabic.process;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import edu.stanford.nlp.international.arabic.ArabicMorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification;
import edu.stanford.nlp.international.morph.MorphoFeatureSpecification.MorphoFeatureType;
import edu.stanford.nlp.international.morph.MorphoFeatures;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
/**
* A class for converting strings to input suitable for processing by
* an IOB sequence model.
*
* @author Spence Green
* @author Will Monroe
*/
public class IOBUtils {
// Training token types.
private enum TokenType { BeginMarker, EndMarker, BothMarker, NoMarker }
// Label inventory
public static final String BeginSymbol = "BEGIN";
public static final String ContinuationSymbol = "CONT";
public static final String NosegSymbol = "NOSEG";
public static final String RewriteSymbol = "REW";
/** @deprecated use RewriteSymbol instead */
public static final String RewriteTahSymbol = "REWTA";
/** @deprecated use RewriteSymbol instead */
public static final String RewriteTareefSymbol = "REWAL";
private static final String BoundarySymbol = ".##.";
private static final String BoundaryChar = ".#.";
// Patterns for tokens that should not be segmented.
private static final Pattern isPunc = Pattern.compile("\\p{Punct}+");
private static final Pattern isDigit = Pattern.compile("\\p{Digit}+");
private static final Pattern notUnicodeArabic = Pattern.compile("\\P{InArabic}+");
// Sets of known clitics for tagging when reconstructing the segmented sequences.
private static final Set<String> arPrefixSet;
private static final Set<String> arSuffixSet;
static {
String arabicPrefixString = "ل ف و م ما ح حا ه ها ك ب س";
arPrefixSet = Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(arabicPrefixString.split("\\s+"))));
String arabicSuffixString = "ل و ما ه ها هم هن نا كم تن تم ى ي هما ك ب ش";
arSuffixSet = Collections.unmodifiableSet(Generics.newHashSet(Arrays.asList(arabicSuffixString.split("\\s+"))));
}
// Only static methods
private IOBUtils() {}
public static String getBoundaryCharacter() { return BoundaryChar; }
/**
* Convert a String to a list of characters suitable for labeling in an IOB
* segmentation model.
*
* @param tokenList
* @param segMarker
* @param applyRewriteRules add rewrite labels (for training data)
*/
public static List<CoreLabel> StringToIOB(List<CoreLabel> tokenList,
Character segMarker,
boolean applyRewriteRules) {
return StringToIOB(tokenList, segMarker, applyRewriteRules, false);
}
/**
* Convert a String to a list of characters suitable for labeling in an IOB
* segmentation model.
*
* @param tokenList
* @param segMarker
* @param applyRewriteRules add rewrite labels (for training data)
* @param stripRewrites revert training data to old Green & DeNero model (remove
* rewrite labels but still rewrite to try to preserve raw text)
*/
public static List<CoreLabel> StringToIOB(List<CoreLabel> tokenList,
Character segMarker,
boolean applyRewriteRules,
boolean stripRewrites) {
List<CoreLabel> iobList = new ArrayList<CoreLabel>(tokenList.size()*7 + tokenList.size());
final String strSegMarker = String.valueOf(segMarker);
boolean addWhitespace = false;
final int numTokens = tokenList.size();
String lastToken = "";
String currentWord = "";
int wordStartIndex = 0;
for (int i = 0; i < numTokens; ++i) {
// What type of token is this
CoreLabel cl = tokenList.get(i);
if (addWhitespace) {
fillInWordStatistics(iobList, currentWord, wordStartIndex);
currentWord = "";
wordStartIndex = iobList.size() + 1;
iobList.add(createDatum(cl, BoundaryChar, BoundarySymbol));
final CoreLabel boundaryDatum = iobList.get(iobList.size() - 1);
boundaryDatum.setIndex(0);
boundaryDatum.setWord("");
addWhitespace = false;
}
String token = cl.word();
TokenType tokType = getTokenType(token, strSegMarker);
token = stripSegmentationMarkers(token, tokType);
assert token.length() != 0;
if (shouldNotSegment(token)) {
iobList.add(createDatum(cl, token, NosegSymbol));
addWhitespace = true;
} else {
// Iterate over the characters in the token
tokenToDatums(iobList, cl, token, tokType, tokenList.get(i), lastToken, applyRewriteRules, stripRewrites);
addWhitespace = (tokType == TokenType.BeginMarker || tokType == TokenType.NoMarker);
}
currentWord += token;
lastToken = token;
}
fillInWordStatistics(iobList, currentWord, wordStartIndex);
return iobList;
}
/**
* Loops back through all the datums inserted for the most recent word
* and inserts statistics about the word they are a part of. This needs to
* be post hoc because the CoreLabel lists coming from testing data sets
* are pre-segmented (so treating each of those CoreLabels as a "word" lets
* us cheat and get 100% classification accuracy by just looking at whether
* we're at the beginning of a "word").
*
* @param iobList
* @param currentWord
* @param wordStartIndex
*/
private static void fillInWordStatistics(List<CoreLabel> iobList,
String currentWord, int wordStartIndex) {
for (int j = wordStartIndex; j < iobList.size(); j++) {
CoreLabel tok = iobList.get(j);
tok.setIndex(j - wordStartIndex);
tok.setWord(currentWord);
}
}
/**
* Convert token to a sequence of datums and add to iobList.
*
* @param iobList
* @param token
* @param tokType
* @param tokenLabel
* @param lastToken
* @param applyRewriteRules
*/
private static void tokenToDatums(List<CoreLabel> iobList,
CoreLabel cl,
String token,
TokenType tokType,
CoreLabel tokenLabel,
String lastToken,
boolean applyRewriteRules,
boolean stripRewrites) {
if (token.isEmpty()) return;
String lastLabel = ContinuationSymbol;
String firstLabel = BeginSymbol;
String rewritten = cl.get(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation.class);
boolean crossRefRewrites = true;
if (rewritten == null) {
rewritten = token;
crossRefRewrites = false;
} else {
rewritten = stripSegmentationMarkers(rewritten, tokType);
}
if (applyRewriteRules) {
// Apply Arabic-specific re-write rules
String rawToken = tokenLabel.word();
String tag = tokenLabel.tag();
MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
featureSpec.activate(MorphoFeatureType.NGEN);
featureSpec.activate(MorphoFeatureType.NNUM);
featureSpec.activate(MorphoFeatureType.DEF);
featureSpec.activate(MorphoFeatureType.TENSE);
MorphoFeatures features = featureSpec.strToFeatures(tag);
// Rule #1 : ت --> ة
if (features.getValue(MorphoFeatureType.NGEN).equals("F") &&
features.getValue(MorphoFeatureType.NNUM).equals("SG") &&
rawToken.endsWith("ت-") &&
!stripRewrites) {
lastLabel = RewriteSymbol;
} else if (rawToken.endsWith("ة-")) {
assert token.endsWith("ة");
token = token.substring(0, token.length() - 1) + "ت";
lastLabel = RewriteSymbol;
}
// Rule #2 : لل --> ل ال
if (lastToken.equals("ل") &&
features.getValue(MorphoFeatureType.DEF).equals("D")) {
if (rawToken.startsWith("-ال")) {
if (!token.startsWith("ا"))
System.err.println("Bad REWAL: " + rawToken + " / " + token);
token = token.substring(1);
rewritten = rewritten.substring(1);
if (!stripRewrites)
firstLabel = RewriteSymbol;
} else if (rawToken.startsWith("-ل")) {
if (!token.startsWith("ل"))
System.err.println("Bad REWAL: " + rawToken + " / " + token);
if (!stripRewrites)
firstLabel = RewriteSymbol;
} else {
System.err.println("Ignoring REWAL: " + rawToken + " / " + token);
}
}
// Rule #3 : ي --> ى
// Rule #4 : ا --> ى
if (rawToken.endsWith("ى-")) {
if (features.getValue(MorphoFeatureType.TENSE) != null) {
// verb: ى becomes ا
token = token.substring(0, token.length() - 1) + "ا";
} else {
// assume preposition:
token = token.substring(0, token.length() - 1) + "ي";
}
if (!stripRewrites)
lastLabel = RewriteSymbol;
} else if (rawToken.equals("علي-") || rawToken.equals("-علي-")) {
if (!stripRewrites)
lastLabel = RewriteSymbol;
}
}
// Create datums and add to iobList
if (token.isEmpty())
System.err.println("Rewriting resulted in empty token: " + tokenLabel.word());
String firstChar = String.valueOf(token.charAt(0));
iobList.add(createDatum(cl, firstChar, firstLabel));
final int numChars = token.length();
if (crossRefRewrites && rewritten.length() != numChars) {
System.err.printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
crossRefRewrites = false;
}
for (int j = 1; j < numChars; ++j) {
String charLabel = (j == numChars-1) ? lastLabel : ContinuationSymbol;
String thisChar = String.valueOf(token.charAt(j));
if (crossRefRewrites && !String.valueOf(rewritten.charAt(j)).equals(thisChar))
charLabel = RewriteSymbol;
if (charLabel == ContinuationSymbol && thisChar.equals("ى") && j != numChars - 1)
charLabel = RewriteSymbol; // Assume all mid-word alef maqsura are supposed to be yah
iobList.add(createDatum(cl, thisChar, charLabel));
}
}
/**
* Identify tokens that should not be segmented.
*/
private static boolean shouldNotSegment(String token) {
return (isDigit.matcher(token).find() ||
isPunc.matcher(token).find() ||
notUnicodeArabic.matcher(token).find());
}
/**
* Strip segmentation markers.
*/
private static String stripSegmentationMarkers(String tok, TokenType tokType) {
int beginOffset = (tokType == TokenType.BeginMarker || tokType == TokenType.BothMarker) ? 1 : 0;
int endOffset = (tokType == TokenType.EndMarker || tokType == TokenType.BothMarker) ? tok.length()-1 : tok.length();
return tokType == TokenType.NoMarker ? tok : tok.substring(beginOffset, endOffset);
}
/**
* Create a datum from a string. The CoreAnnotations must correspond to those used by
* SequenceClassifier. The following annotations are copied from the provided
* CoreLabel cl, if present:
* DomainAnnotation
*/
private static CoreLabel createDatum(CoreLabel cl, String token, String label) {
CoreLabel newTok = new CoreLabel();
newTok.set(CoreAnnotations.TextAnnotation.class, token);
newTok.set(CoreAnnotations.CharAnnotation.class, token);
newTok.set(CoreAnnotations.AnswerAnnotation.class, label);
newTok.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
if (cl != null && cl.containsKey(CoreAnnotations.DomainAnnotation.class))
newTok.set(CoreAnnotations.DomainAnnotation.class,
cl.get(CoreAnnotations.DomainAnnotation.class));
return newTok;
}
/**
* Deterministically classify a token.
*/
private static TokenType getTokenType(String token, String segMarker) {
if (segMarker == null || token.equals(segMarker)) {
return TokenType.NoMarker;
}
TokenType tokType = TokenType.NoMarker;
boolean startsWithMarker = token.startsWith(segMarker);
boolean endsWithMarker = token.endsWith(segMarker);
if (startsWithMarker && endsWithMarker) {
tokType = TokenType.BothMarker;
} else if (startsWithMarker) {
tokType = TokenType.BeginMarker;
} else if (endsWithMarker) {
tokType = TokenType.EndMarker;
}
return tokType;
}
/**
* This version is for turning an unsegmented string to an IOB input, i.e.,
* for processing raw text.
*/
public static List<CoreLabel> StringToIOB(String string) {
return StringToIOB(string, null);
}
public static List<CoreLabel> StringToIOB(String str, Character segMarker) {
// Whitespace tokenization
List<CoreLabel> toks = Sentence.toCoreLabelList(str.trim().split("\\s+"));
return StringToIOB(toks, segMarker, false);
}
/**
* Convert a list of labeled characters to a String. Include segmentation markers
* for prefixes and suffixes in the string, and add a space at segmentations.
*/
public static String IOBToString(List<CoreLabel> labeledSequence, String prefixMarker, String suffixMarker) {
return IOBToString(labeledSequence, prefixMarker, suffixMarker, true, true);
}
/**
* Convert a list of labeled characters to a String. Include segmentation markers
* (but no spaces) at segmentation boundaries.
*/
public static String IOBToString(List<CoreLabel> labeledSequence, String segmentationMarker) {
return IOBToString(labeledSequence, segmentationMarker, null, false, true);
}
/**
* Convert a list of labeled characters to a String. Preserve the original (unsegmented) text.
*/
public static String IOBToString(List<CoreLabel> labeledSequence) {
return IOBToString(labeledSequence, null, null, false, false);
}
private static String IOBToString(List<CoreLabel> labeledSequence,
String prefixMarker, String suffixMarker, boolean addSpace, boolean applyRewrites) {
StringBuilder sb = new StringBuilder();
String lastLabel = "";
final boolean addPrefixMarker = prefixMarker != null && prefixMarker.length() > 0;
final boolean addSuffixMarker = suffixMarker != null && suffixMarker.length() > 0;
if (addPrefixMarker || addSuffixMarker)
annotateMarkers(labeledSequence);
final int sequenceLength = labeledSequence.size();
for (int i = 0; i < sequenceLength; ++i) {
CoreLabel labeledChar = labeledSequence.get(i);
String token = labeledChar.get(CoreAnnotations.CharAnnotation.class);
if (addPrefixMarker && token.equals(prefixMarker))
token = "#pm#";
if (addSuffixMarker && token.equals(suffixMarker))
token = "#sm#";
String label = labeledChar.get(CoreAnnotations.AnswerAnnotation.class);
if (token.equals(BoundaryChar)) {
sb.append(" ");
} else if (label.equals(BeginSymbol)) {
if (lastLabel.equals(ContinuationSymbol) || lastLabel.equals(BeginSymbol) ||
lastLabel.equals(RewriteSymbol)) {
if (addPrefixMarker && (!addSpace || addPrefixMarker(i, labeledSequence))) {
sb.append(prefixMarker);
}
if (addSpace) {
sb.append(" ");
}
if (addSuffixMarker && (!addSpace || addSuffixMarker(i, labeledSequence))) {
sb.append(suffixMarker);
}
}
sb.append(token);
} else if (label.equals(ContinuationSymbol) || label.equals(BoundarySymbol)) {
sb.append(token);
} else if (label.equals(NosegSymbol)) {
if ( ! lastLabel.equals(BoundarySymbol) && addSpace) {
sb.append(" ");
}
sb.append(token);
} else if (label.equals(RewriteSymbol) || label.equals("REWAL") || label.equals("REWTA")) {
switch (token) {
case "ت":
case "ه":
sb.append(applyRewrites ? "ة" : token);
break;
case "ل":
sb.append((addPrefixMarker ? prefixMarker : "") +
(addSpace ? " " : "") +
(applyRewrites ? "ال" : "ل"));
break;
case "ي":
case "ا":
sb.append(applyRewrites ? "ى" : token);
break;
case "ى":
sb.append(applyRewrites ? "ي" : token);
break;
default:
// Nonsense rewrite predicted by the classifier--just assume CONT
sb.append(token);
break;
}
} else {
throw new RuntimeException("Unknown label: " + label);
}
lastLabel = label;
}
return sb.toString().trim();
}
private static class PrefixMarkerAnnotation implements CoreAnnotation<Boolean> {
@Override
public Class<Boolean> getType() {
return Boolean.class;
}
}
private static class SuffixMarkerAnnotation implements CoreAnnotation<Boolean> {
@Override
public Class<Boolean> getType() {
return Boolean.class;
}
}
private static void annotateMarkers(List<CoreLabel> labeledSequence) {
StringBuilder segment = new StringBuilder();
List<String> segments = CollectionUtils.makeList();
int wordBegin = 0;
for (int i = 0; i < labeledSequence.size(); i++) {
String token = labeledSequence.get(i).get(CoreAnnotations.CharAnnotation.class);
String label = labeledSequence.get(i).get(CoreAnnotations.AnswerAnnotation.class);
switch (label) {
case BeginSymbol:
if (i != wordBegin) {
segments.add(segment.toString());
segment.setLength(0);
}
segment.append(token);
break;
case BoundarySymbol:
segments.add(segment.toString());
segment.setLength(0);
annotateMarkersOnWord(labeledSequence, wordBegin, i, segments);
segments.clear();
wordBegin = i + 1;
break;
default:
segment.append(token);
break;
}
}
segments.add(segment.toString());
annotateMarkersOnWord(labeledSequence, wordBegin, labeledSequence.size(), segments);
}
private static void annotateMarkersOnWord(List<CoreLabel> labeledSequence,
int wordBegin, int wordEnd, List<String> segments) {
Pair<Integer, Integer> headBounds = getHeadBounds(segments);
int currentIndex = 0;
for (int i = wordBegin; i < wordEnd; i++) {
String label = labeledSequence.get(i).get(CoreAnnotations.AnswerAnnotation.class);
labeledSequence.get(i).set(PrefixMarkerAnnotation.class, Boolean.FALSE);
labeledSequence.get(i).set(SuffixMarkerAnnotation.class, Boolean.FALSE);
if (label.equals(BeginSymbol)) {
// Add prefix markers for BEGIN characters up to and including the start of the head
// (but don't add prefix markers if there aren't any prefixes)
if (currentIndex <= headBounds.first && currentIndex != 0)
labeledSequence.get(i).set(PrefixMarkerAnnotation.class, Boolean.TRUE);
// Add suffix markers for BEGIN characters starting one past the end of the head
// (headBounds.second is one past the end, no need to add one)
if (currentIndex >= headBounds.second)
labeledSequence.get(i).set(SuffixMarkerAnnotation.class, Boolean.TRUE);
currentIndex++;
}
}
}
private static Pair<Integer, Integer> getHeadBounds(List<String> segments) {
final int NOT_FOUND = -1;
int potentialSuffix = segments.size() - 1;
int nonSuffix = NOT_FOUND;
int potentialPrefix = 0;
int nonPrefix = NOT_FOUND;
// Heuristic algorithm for finding the head of a segmented word:
while (true) {
/* Alternate considering suffixes and prefixes (starting with suffix).
*
* If the current segment is a known Arabic {suffix|prefix}, mark it as
* such. Otherwise, stop considering tokens from that direction.
*/
if (nonSuffix == NOT_FOUND){
if (arSuffixSet.contains(segments.get(potentialSuffix)))
potentialSuffix--;
else
nonSuffix = potentialSuffix;
}
if (potentialSuffix < potentialPrefix)
break;
if (nonPrefix == NOT_FOUND) {
if (arPrefixSet.contains(segments.get(potentialPrefix)))
potentialPrefix++;
else
nonPrefix = potentialPrefix;
}
if (potentialSuffix < potentialPrefix || (nonSuffix != NOT_FOUND && nonPrefix != NOT_FOUND))
break;
}
/* Once we have exhausted all known prefixes and suffixes, take the longest
* segment that remains to be the head. Break length ties by picking the first one.
*
* Note that in some cases, no segments will remain (e.g. b# +y), so a
* segmented word may have zero or one heads, but never more than one.
*/
if (potentialSuffix < potentialPrefix) {
// no head--start and end are index of first suffix
if (potentialSuffix + 1 != potentialPrefix)
throw new RuntimeException("Suffix pointer moved too far!");
return Pair.makePair(potentialSuffix + 1, potentialSuffix + 1);
} else {
int headIndex = nonPrefix;
for (int i = nonPrefix + 1; i <= nonSuffix; i++) {
if (segments.get(i).length() > segments.get(headIndex).length())
headIndex = i;
}
return Pair.makePair(headIndex, headIndex + 1);
}
}
private static boolean addPrefixMarker(int focus, List<CoreLabel> labeledSequence) {
return labeledSequence.get(focus).get(PrefixMarkerAnnotation.class).booleanValue();
}
private static boolean addSuffixMarker(int focus, List<CoreLabel> labeledSequence) {
return labeledSequence.get(focus).get(SuffixMarkerAnnotation.class).booleanValue();
}
public static void labelDomain(List<CoreLabel> tokenList, String domain) {
for (CoreLabel cl : tokenList) {
cl.set(CoreAnnotations.DomainAnnotation.class, domain);
}
}
}