import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.HashSet;
import java.util.Set;
import org.tartarus.snowball.SnowballStemmer;
/**
 * Command-line utility that regenerates the {@code var stopList = { ... };}
 * object literal inside a JavaScript file.
 *
 * <p>It reads the stop-word lists {@code stop/<name>.txt} (UTF-8, one entry per
 * line), stems every entry with every configured Snowball stemmer, and writes
 * the resulting unique stems back into the JavaScript file given as the first
 * command-line argument.
 */
public class StemmerStopBuilder {

    /**
     * Snowball language names. Each one is resolved by reflection to the class
     * {@code org.tartarus.snowball.ext.<name>Stemmer}.
     */
    static String[] stemmers = { "danish", "dutch", "english", "finnish",
            "french", "german", "hungarian", "italian", "norwegian",
            "portuguese", "russian", "spanish", "swedish" };

    /*
     * Base names of the stop-word files, read from stop/<name>.txt.
     * This array is NOT paired index-by-index with `stemmers`: every word from
     * every file is run through every stemmer.
     * NOTE(review): "de" is listed twice and there is no "nl" entry; the second
     * slot presumably was meant to be the Dutch list - confirm against the
     * contents of the stop/ directory before changing it.
     */
    static String[] stopArray = { "da", "de", "en", "fi", "fr", "de", "hu",
            "it", "no", "pt", "ru", "es", "sv", "ua", "iv" };

    /*
     * Matches the existing "var stopList = { ... };" statement.
     * (?s) is the embedded DOTALL flag so that '.' also matches line breaks.
     * The body uses the reluctant ".*?" so the match ends at the FIRST "};"
     * after the opening brace; a greedy quantifier would run to the last "};"
     * in the file and wipe out any code that follows the stop list.
     */
    private static final String STOP_LIST_REGEX =
            "(?s)var(\\s)+stopList(\\s)*=(\\s)*\\{.*?\\};";

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the path of the JavaScript file whose
     *             {@code var stopList = {...};} statement is replaced in place
     */
    public static void main(String[] args) {
        // Validate the argument before doing the expensive read/stem work.
        if (args == null || args.length < 1) {
            System.err.println("usage: StemmerStopBuilder <path-to-js-file>");
            return;
        }
        try {
            ArrayList<String> stopList = readStopWords();
            System.out.println("read " + stopList.size() + " words");

            Set<String> stopSet = buildStemedStopSet(stopList);
            System.out.println("unique stemed " + stopSet.size() + " words");

            /* Find the piece of JS code to replace */
            String stopFilterBuffer = readStopFilter(args[0]);
            String stopStringList = buildStopList(stopSet);

            // Build the complete new content BEFORE opening the output stream:
            // opening it truncates the file, so nothing may fail in between.
            byte[] output = stopFilterBuffer.replaceFirst(STOP_LIST_REGEX,
                    Matcher.quoteReplacement(stopStringList)).getBytes("UTF-8");

            FileOutputStream fos = new FileOutputStream(new File(args[0]));
            try {
                fos.write(output);
                fos.flush();
            } finally {
                fos.close();
            }
        } catch (Exception ex) {
            // Report on stderr with the stack trace so failures are diagnosable.
            ex.printStackTrace();
        }
    }

    /**
     * Reads every file named by {@link #stopArray} and collects its non-blank,
     * trimmed lines.
     *
     * @return all stop-word entries in file order (duplicates are kept)
     * @throws Exception if a file cannot be opened or read
     */
    private static ArrayList<String> readStopWords() throws Exception {
        ArrayList<String> stopList = new ArrayList<String>();
        for (String stopItem : stopArray) {
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    new FileInputStream("stop/" + stopItem + ".txt"), "UTF-8"));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    String trimmedLine = line.trim();
                    if (!trimmedLine.isEmpty()) {
                        stopList.add(trimmedLine);
                    }
                }
            } finally {
                in.close();
            }
        }
        return stopList;
    }

    /**
     * Reads the whole file as UTF-8 text (the same encoding {@code main} uses
     * when writing it back).
     *
     * @param str path of the file to read
     * @return the file content
     * @throws Exception if the file cannot be opened or read
     */
    private static String readStopFilter(String str) throws Exception {
        final File file = new File(str);
        final byte[] bytes = new byte[(int) file.length()];
        final BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
        int filled = 0;
        try {
            // A single read() may return fewer bytes than requested; keep
            // reading until the buffer is full or the stream ends.
            while (filled < bytes.length) {
                int count = bis.read(bytes, filled, bytes.length - filled);
                if (count < 0) {
                    break;
                }
                filled += count;
            }
        } finally {
            bis.close();
        }
        return new String(bytes, 0, filled, "UTF-8");
    }

    /**
     * Renders the stems as a JavaScript statement of the form
     * {@code var stopList = { "a" : null, "b" : null };} with one entry per line.
     *
     * @param stopSet stems to emit; iteration order of the set is used as-is
     * @return the complete {@code var stopList = {...};} statement
     */
    private static String buildStopList(Set<String> stopSet) {
        StringBuilder stopListBuffer = new StringBuilder("var stopList = {\n\t\t");
        boolean isFirst = true;
        for (String stemStopWord : stopSet) {
            if (isFirst) {
                isFirst = false;
            } else {
                stopListBuffer.append(",\n\t\t");
            }
            stopListBuffer.append(javaStringLiteral(stemStopWord)).append(" : null");
        }
        stopListBuffer.append("\n\t};");
        return stopListBuffer.toString();
    }

    /**
     * Stems every stop-list entry with every stemmer in {@link #stemmers} and
     * returns the distinct results. Entries containing spaces are split on a
     * single space, stemmed part by part, and re-joined with single spaces.
     *
     * @param stopList raw stop-word entries
     * @return the set of unique stemmed entries
     * @throws Exception if a stemmer class cannot be loaded or instantiated
     */
    private static Set<String> buildStemedStopSet(ArrayList<String> stopList) throws Exception {
        Set<String> stopSet = new HashSet<String>();
        for (String stemmerLanguage : stemmers) {
            System.out.println(stemmerLanguage);
            Class<?> stemClass = Class.forName("org.tartarus.snowball.ext."
                    + stemmerLanguage + "Stemmer");
            // Class.newInstance() is deprecated; go through the no-arg constructor.
            SnowballStemmer stemmer = (SnowballStemmer) stemClass
                    .getDeclaredConstructor().newInstance();
            for (String stopWord : stopList) {
                StringBuilder sb = new StringBuilder();
                for (String part : stopWord.split(" ")) {
                    if (sb.length() > 0) {
                        sb.append(" ");
                    }
                    stemmer.setCurrent(part);
                    stemmer.stem();
                    sb.append(stemmer.getCurrent());
                }
                stopSet.add(sb.toString());
            }
        }
        return stopSet;
    }

    /*
     * Based on
     * http://stackoverflow.com/questions/2453231/how-would-you-convert-a-string-
     * to-a-java-string-literal
     */
    /**
     * Quotes and escapes a string as a double-quoted literal.
     *
     * <p>Line feed, carriage return, double quote and backslash use their
     * short escapes; every other character below 0x20 or at/above 0x80 is
     * written as {@code \}{@code uXXXX}. Octal escapes are deliberately not
     * used because the result is embedded in JavaScript, where they are
     * rejected in strict mode.
     *
     * @param str text to quote
     * @return the escaped literal, including the surrounding double quotes
     */
    private static String javaStringLiteral(String str) {
        StringBuilder sb = new StringBuilder("\"");
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c == '\n') {
                sb.append("\\n");
            } else if (c == '\r') {
                sb.append("\\r");
            } else if (c == '"') {
                sb.append("\\\"");
            } else if (c == '\\') {
                sb.append("\\\\");
            } else if (c < 0x20 || c >= 0x80) {
                sb.append(String.format("\\u%04x", (int) c));
            } else {
                sb.append(c);
            }
        }
        sb.append("\"");
        return sb.toString();
    }
}