/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
/*
An error? CoNLLTrue MalletTrue MalletPred
O O O
I-MISC B-MISC B-MISC
B-MISC B-MISC I-MISC
I-MISC B-MISC I-MISC
O O O
O O O
O O O
*/
package cc.mallet.share.casutton.ner; // Generated package name
import java.util.regex.*;
import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.pipe.*;
import cc.mallet.types.*;
/**
* Reads a data file in CoNLL 2003 format, and makes some simple
* transformations.
*
* Unlike the version in <tt>mccallum.ner</tt>, does not expect fields in
* the data file for tags and phrases if those features are off. Does
* not look for a target field if isTargetProcessing() is false.
*/
public class ConllNer2003Sentence2TokenSequence extends Pipe
{
  /** Word suffixes for which a spelling feature is emitted when doSpelling is on. */
  static final String[] endings = new String[]
    {"ing", "ed", "ogy", "s", "ly", "ion", "tion", "ity", "ies"};

  /** One compiled pattern per entry of {@link #endings}, matching words with that suffix. */
  static Pattern[] endingPatterns = new Pattern[endings.length];

  // Indexed by {forward,backward} {0,1,2 offset} {ending char ngram index}
  static final String[][][] endingNames = new String[2][3][endings.length];

  // Static initializer: the tables above are static, so they are filled once at
  // class-load time instead of being rebuilt by every constructor call, and they
  // are populated even when no constructor of this class has run.
  static {
    for (int i = 0; i < endings.length; i++) {
      endingPatterns[i] = Pattern.compile (".*"+endings[i]+"$");
      for (int j = 0; j < 3; j++) {
        for (int k = 0; k < 2; k++) {
          endingNames[k][j][i] = "W"+(k==1?"-":"")+j+"=<END"+endings[i]+">";
        }
      }
    }
  }

  // Patterns used by pipe(); compiled once rather than once per token.
  private static final Pattern INSIDE_LABEL = Pattern.compile ("I-.*");
  private static final Pattern YEAR = Pattern.compile ("19\\d\\d");
  private static final Pattern YEAR_DECADE = Pattern.compile ("19\\d\\ds");
  private static final Pattern YEAR_SPAN = Pattern.compile ("19\\d\\d-\\d+");
  // Digits, a literal backslash, a slash, one digit (e.g. the text 1\/2).
  private static final Pattern FRACTION = Pattern.compile ("\\d+\\\\/\\d");
  private static final Pattern DIGITS = Pattern.compile ("\\d[\\d,\\.]*");
  private static final Pattern DATELINE_DATE = Pattern.compile ("19\\d\\d-\\d\\d-\\d\\d");
  private static final Pattern LED_SUFFIX = Pattern.compile (".*-led");
  private static final Pattern SPONSORED_SUFFIX = Pattern.compile (".*-sponsored");

  /** Placeholder word/tag/phrase used for an empty input line. */
  private static final String SENTENCE_BOUNDARY = "-<S>-";

  boolean saveSource = true;
  boolean doConjunctions = false;
  boolean doTags = true;
  boolean doPhrases = true;
  boolean doSpelling = false;
  boolean doDigitCollapses = true;
  boolean doDowncasing = false;

  /** Creates a pipe that expects both the tag and the phrase column. */
  public ConllNer2003Sentence2TokenSequence ()
  {
    super (null, new LabelAlphabet());
  }

  /**
   * Creates a pipe with explicit control over which columns are read.
   *
   * @param useTags    whether each line carries a tag field after the word
   * @param usePhrases whether each line carries a phrase field
   */
  public ConllNer2003Sentence2TokenSequence (boolean useTags, boolean usePhrases)
  {
    super (null, new LabelAlphabet());
    this.doTags = useTags;
    this.doPhrases = usePhrases;
  }

  /* Lines look like this:
  -DOCSTART- -X- -X- O
  EU NNP I-NP I-ORG
  rejects VBZ I-VP O
  German JJ I-NP I-MISC
  call NN I-NP O
  to TO I-VP O
  boycott VB I-VP O
  British JJ I-NP I-MISC
  lamb NN I-NP O
  . . O O
  Peter NNP I-NP I-PER
  Blackburn NNP I-NP I-PER
  BRUSSELS NNP I-NP I-LOC
  1996-08-22 CD I-NP O
  The DT I-NP O
  European NNP I-NP I-ORG
  Commission NNP I-NP I-ORG
  said VBD I-VP O
  on IN I-PP O
  ...
  */
  /**
   * Converts the carrier's data, a String with one space-separated token line
   * per newline, into a token sequence, and (when target processing is on)
   * builds a label sequence in which every segment starts with a "B-" label.
   *
   * @param carrier instance whose data is the String to convert
   * @return the same instance, with its data replaced by the token sequence,
   *         its target set when target processing is on, and its source set
   *         to the rebuilt text when saveSource is on
   * @throws IllegalArgumentException if a non-empty line has fewer fields than
   *         the enabled columns require
   */
  public Instance pipe (Instance carrier)
  {
    String sentenceLines = (String) carrier.getData();
    String[] lines = sentenceLines.split ("\n");
    boolean targetProcessing = isTargetProcessing ();
    LabelSequence target = new LabelSequence ((LabelAlphabet)getTargetAlphabet(), lines.length);

    // The buffer backs every token span, so it is always built; it is only
    // attached to the carrier when saveSource is set. (Previously it was null
    // when saveSource was false, which made this method throw.)
    StringBuffer source = new StringBuffer();
    TokenSequence data = new StringTokenization (source);

    String prevLabel = "NOLABEL";
    String word, tag = null, phrase = null, label = null;

    for (int i = 0; i < lines.length; i++) {
      if (lines[i].length() != 0) {
        String[] fields = lines[i].split (" ");
        int expected = 1 + (doTags ? 1 : 0) + (doPhrases ? 1 : 0) + (targetProcessing ? 1 : 0);
        if (fields.length < expected) {
          throw new IllegalArgumentException ("Invalid line "+lines[i]+" : expected word "
              + (doTags ? ", tag" : "")
              + (doPhrases ? ", phrase" : "")
              + (targetProcessing ? ", target" : "")
              + ".");
        }
        int fieldIdx = 0;
        word = fields[fieldIdx++];
        if (doTags) tag = fields[fieldIdx++];
        if (doPhrases) phrase = fields[fieldIdx++];
        if (targetProcessing) label = fields[fieldIdx++];
      } else {
        // An empty line becomes an explicit boundary token.
        word = SENTENCE_BOUNDARY;
        tag = SENTENCE_BOUNDARY;
        phrase = SENTENCE_BOUNDARY;
        label = "O";
      }

      // Transformations
      if (doDigitCollapses) {
        word = collapseDigits (word);
      }
      if (doDowncasing) {
        word = word.toLowerCase();
      }

      int start = source.length ();
      if (word.equals (SENTENCE_BOUNDARY)) source.append ("\n\n");
      source.append (word);
      source.append (" ");
      // End index stops before the separator space appended just above.
      Token token = new StringSpan (source, start, source.length () - 1);

      // Suffix, tag and phrase features for the current token
      if (doSpelling) {
        for (int j = 0; j < endings.length; j++) {
          if (endingPatterns[j].matcher(word).matches()) {
            token.setFeatureValue (endingNames[0][0][j], 1);
          }
        }
      }
      if (doTags) {
        token.setFeatureValue ("T="+tag, 1);
      }
      if (doPhrases) {
        token.setFeatureValue ("P="+phrase, 1);
      }
      data.add (token);

      if (targetProcessing) {
        // Change so each segment always begins with a "B-",
        // even if previous token did not have this label.
        String oldLabel = label;
        if (INSIDE_LABEL.matcher(label).matches ()
            && (prevLabel.length() < 3 // prevLabel is "O"
                || !prevLabel.substring(2).equals (label.substring(2)))) {
          label = "B" + oldLabel.substring(1);
        }
        // Compare against the label as read from the file, not the rewritten one.
        prevLabel = oldLabel;
        target.add (label);
      }
    }

    carrier.setData(data);
    if (targetProcessing) carrier.setTarget(target);
    if (saveSource) carrier.setSource(source);
    return carrier;
  }

  /**
   * Replaces year-, number- and date-like words, and words ending in "-led" or
   * "-sponsored", with a placeholder token; the first matching rule wins.
   * Words that match no rule are returned unchanged.
   */
  private static String collapseDigits (String word)
  {
    if (YEAR.matcher(word).matches()) return "<YEAR>";
    if (YEAR_DECADE.matcher(word).matches()) return "<YEARDECADE>";
    if (YEAR_SPAN.matcher(word).matches()) return "<YEARSPAN>";
    if (FRACTION.matcher(word).matches()) return "<FRACTION>";
    if (DIGITS.matcher(word).matches()) return "<DIGITS>";
    if (DATELINE_DATE.matcher(word).matches()) return "<DATELINEDATE>";
    if (LED_SUFFIX.matcher(word).matches()) return "<LED>";
    // "-sponsored" words share the "<LED>" placeholder.
    if (SPONSORED_SUFFIX.matcher(word).matches()) return "<LED>";
    return word;
  }

  // serialization garbage
  private static final long serialVersionUID = -7326674871670572522L;
}