package com.livingsocial.hive.udf;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
 * Tokenize: splits a chunk of natural-language text into an array of stemmed,
 * lowercased words. English stop words are excluded from the output.
*
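 * A rough sketch of registering and invoking the function from Hive; the jar
 * name, table, and column below are illustrative, not part of this project:
 *
 * <pre>
 *   ADD JAR hive-udfs.jar;
 *   CREATE TEMPORARY FUNCTION tokenize AS 'com.livingsocial.hive.udf.Tokenize';
 *   SELECT tokenize(description) FROM deals;
 * </pre>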
*/
@Description(name = "tokenize", value = "_FUNC_(str) - Splits str"
        + " into an array of stemmed, lowercased words")
public class Tokenize extends UDF {
    public ArrayList<Text> evaluate(Text text) throws HiveException {
        ArrayList<Text> result = new ArrayList<Text>();
        // Hive passes NULL column values as null; follow the usual UDF
        // convention and return NULL rather than throwing.
        if (text == null) {
            return null;
        }
        Analyzer analyzer = new MyAnalyzer();
        try {
            TokenStream stream = analyzer.tokenStream("",
                    new StringReader(text.toString()));
            // The attribute instance is reused across tokens; fetch it once
            // and read its contents after each incrementToken() call.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                result.add(new Text(term.toString()));
            }
            // The TokenStream contract requires end() and close() once the
            // stream has been fully consumed.
            stream.end();
            stream.close();
        } catch (IOException e) {
            throw new HiveException(e);
        } finally {
            analyzer.close();
        }
        return result;
    }
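    /**
     * Holder class idiom: defers loading the default English stop word set
     * ({@link StandardAnalyzer#STOP_WORDS_SET}) until it is first used.
     */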
    private static class DefaultSetHolder {
        static final CharArraySet DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
    }
    /**
     * Custom Analyzer based on {@link StandardAnalyzer}, except that it uses
     * the {@link KStemFilter} in place of the more aggressive
     * {@link PorterStemFilter}. An {@link ASCIIFoldingFilter} is added to
     * remove accents from words, and an {@link HTMLStripCharFilter} to strip
     * out HTML elements.
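     * <p>
     * As a rough, unverified illustration of the full chain, an input such as
     * {@code "<b>The Caffè Latte's</b> deals"} should come out approximately
     * as {@code [caffe, latte, deal]}: markup stripped, "The" dropped as a
     * stop word, the possessive removed, the accent folded, and "deals"
     * stemmed to "deal".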
     */
    private static class MyAnalyzer extends Analyzer {
        @Override
        protected TokenStreamComponents createComponents(String fieldName,
                Reader reader) {
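            // Filter order matters: tokens are lower-cased before the stop
            // filter so that the (lower case) default stop set matches, and
            // accent folding runs before stemming so the stemmer sees
            // unaccented tokens.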
            Version matchVersion = Version.LUCENE_45;
            final Tokenizer source = new StandardTokenizer(matchVersion, reader);
            TokenStream result = new StandardFilter(matchVersion, source);
            result = new EnglishPossessiveFilter(matchVersion, result);
            result = new LowerCaseFilter(matchVersion, result);
            result = new StopFilter(matchVersion, result,
                    DefaultSetHolder.DEFAULT_STOP_SET);
            result = new ASCIIFoldingFilter(result);
            result = new KStemFilter(result);
            return new TokenStreamComponents(source, result);
        }
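        // Char filters wrap the raw Reader before the tokenizer runs, so the
        // HTML markup is stripped before tokenization.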
        @Override
        protected Reader initReader(String fieldName, Reader reader) {
            return new HTMLStripCharFilter(reader);
        }
    }
}