package cc.twittertools.udf;
import java.io.IOException;
import java.io.StringReader;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import cc.twittertools.index.LowerCaseEntityPreservingFilter;
public class LuceneTokenizer extends EvalFunc<DataBag>{
TupleFactory mTupleFactory = TupleFactory.getInstance();
BagFactory mBagFactory = BagFactory.getInstance();
public DataBag exec(Tuple input) throws IOException{
try {
DataBag output = mBagFactory.newDefaultBag();
Object o = input.get(0);
if (!(o instanceof String)) {
throw new IOException("Expected input to be chararray, but got " + o.getClass().getName());
}
Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader((String)o));
TokenStream tokenstream = new LowerCaseEntityPreservingFilter(source);
tokenstream.reset();
while (tokenstream.incrementToken()){
String token = tokenstream.getAttribute(CharTermAttribute.class).toString();
output.add(mTupleFactory.newTuple(token));
}
return output;
} catch (Exception e) {
// error handling goes here
throw new IOException("caught exception",e);
}
}
}