package org.sf.mustru.test;
//import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
//import org.apache.lucene.analysis.standard.StandardAnalyzer;
//import org.apache.lucene.analysis.StopAnalyzer;
//import org.apache.lucene.analysis.WhitespaceAnalyzer;
//import org.apache.lucene.analysis.standard.StandardAnalyzer;
//import org.sf.mustru.utils.StandardGapAnalyzer;
import org.sf.mustru.utils.StandardBgramAnalyzer;
//import com.aliasi.util.Files;
/**
 * Command-line harness that runs a sample sentence through a Lucene analyzer and
 * prints every token the analyzer produces. Similar to the AnalyzerUtils class in
 * "Lucene in Action".
 */
public class TestAnalyzers
{
    /** Alternative sample text; assign it to {@code str} in {@code main} to analyze it instead. */
    private static final String ALICE_SAMPLE = "Alice was 1987 and 233,999.145$ beginning to get very tired of sitting-by-her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'";

    /** Sample text that {@code main} analyzes. */
    private static final String KILIMANJARO_SAMPLE = "We were on the east flank of Mt. Kilimanjaro -- at 19,342 feet the highest mountain in Africa. In the pre-dawn hours I locked my eyes onto the";

    /**
     * Analyzes the sample text with a {@code StandardBgramAnalyzer} and prints the
     * resulting tokens to standard output, bracketed by "Started" and "Ended" lines.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args)
    {
        try
        {
            System.out.println("Started");

            //*-- swap in another Analyzer implementation here to compare tokenizations
            StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer();
            String str = KILIMANJARO_SAMPLE;

            System.out.println("STRING: " + str);
            displayTokensWithDetails(analyzer, str);
        }
        catch (IOException ie)
        {
            System.out.println("IO Error " + ie.getMessage());
        }
        System.out.println("Ended");
    }

    /**
     * Prints every token produced by the analyzer for the given text.
     *
     * A token with a positive position increment starts a new output line that is
     * prefixed with the running position; a token with an increment of zero or less
     * is appended to the current line. Each token is shown as
     * {@code [text: type] startOffset:endOffset}.
     *
     * @param analyzer the analyzer used to tokenize the text
     * @param text     the text to analyze
     * @throws IOException if the analyzer fails while reading the text
     */
    private static void displayTokensWithDetails(Analyzer analyzer, String text) throws IOException
    {
        //*-- get the list of tokens using the passed analyzer
        Token[] tokens = tokensFromAnalysis(analyzer, text);

        int position = 0;
        for (int i = 0; i < tokens.length; i++)
        {
            Token token = tokens[i];
            int increment = token.getPositionIncrement();

            //*-- only a positive increment advances the position and starts a new line
            if (increment > 0)
            {
                position = position + increment;
                System.out.println();
                System.out.print(position + ": ");
            }

            System.out.print("\t [" + token.termText() + ": " + token.type() + "] " + token.startOffset() + ":" + token.endOffset());
        } //*-- end of for
        System.out.println("");
    }

    /**
     * Uses the passed analyzer to collect all tokens of the text, in stream order.
     *
     * The token stream is opened on the field name "contents" and is always closed
     * before this method returns, whether or not reading succeeds.
     *
     * @param analyzer the analyzer used to tokenize the text
     * @param text     the text to analyze
     * @return the tokens produced by the analyzer; empty if the stream yields none
     * @throws IOException if reading from or closing the token stream fails
     */
    private static Token[] tokensFromAnalysis(Analyzer analyzer, String text) throws IOException
    {
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        ArrayList<Token> tokenList = new ArrayList<Token>();
        try
        {
            Token token;
            while ((token = stream.next()) != null)
            {
                tokenList.add(token);
            }
        }
        finally
        {
            //*-- release the stream even when next() throws
            stream.close();
        }
        return tokenList.toArray(new Token[tokenList.size()]);
    }
}