Package

Source Code of StemmerStopBuilder

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.tartarus.snowball.SnowballStemmer;

public class StemmerStopBuilder {
  static String[] stemmers = { "danish", "dutch", "english", "finnish",
      "french", "german", "hungarian", "italian", "norwegian",
      "portuguese", "russian", "spanish", "swedish" };

  static String[] stopArray = { "da", "de", "en", "fi", "fr", "de", "hu",
      "it", "no", "pt", "ru", "es", "sv", "ua", "iv" };

  /**
   * @param args
   */

  public static void main(String[] args) {
    try {
      ArrayList<String> stopList = new ArrayList<String>();

      for (String stopItem : stopArray) {
        BufferedReader in = new BufferedReader(new InputStreamReader(
            new FileInputStream("stop/" + stopItem + ".txt"), "UTF-8"));

        String line = null;
        while ((line = in.readLine()) != null) {
          String trimedLine = line.trim();
          if (trimedLine.isEmpty())
            continue;
          stopList.add(trimedLine);
        }

        in.close();
      }

      System.out.println("read " + stopList.size() + " words");

      Set<String> stopSet = buildStemedStopSet(stopList);

      System.out.println("unique stemed " + stopSet.size() + " words");

      /* Find peace of js code to replace */

      String stopFilterBuffer = readStopFilter(args[0]);
      String stopStringList = buildStopList(stopSet);

      File file = new File(args[0]);
      FileOutputStream fos = new FileOutputStream(file);

      /* (?s) - embedded flag expression (http://download-llnw.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html#DOTALL) */

      fos.write(stopFilterBuffer.replaceFirst("(?s)var(\\s)+stopList(\\s)*=(\\s)*\\{.+\\};",
        Matcher.quoteReplacement(stopStringList)).getBytes("UTF-8"));

      fos.flush();
      fos.close();
    } catch (Exception ex) {
      System.out.println(ex);
    }
  }

  private static String readStopFilter(String str) throws Exception {
    final File file = new File(str);
    final BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
    final byte [] bytes = new byte[(int)file.length()];
    bis.read(bytes);
    bis.close();

    return new String(bytes);
  }

  private static String buildStopList(Set<String> stopSet) {
    StringBuffer stopListBuffer = new StringBuffer("var stopList = {\n\t\t");
    boolean isFirst = true;

    for (String stemStopWord : stopSet) {
      if (!isFirst)
        stopListBuffer.append(",\n\t\t");
      else
        isFirst = false;
      stopListBuffer.append(javaStringLiteral(stemStopWord) + " : null");
    }

    stopListBuffer.append("\n\t};");

    return stopListBuffer.toString();
  }

  private static Set<String> buildStemedStopSet(ArrayList<String> stopList) throws Exception {
    Set<String> stopSet = new HashSet<String>();

    for (String stemmerLanguage : stemmers) {
      System.out.println(stemmerLanguage);

      Class<?> stemClass = Class.forName("org.tartarus.snowball.ext."
            + stemmerLanguage + "Stemmer");
      SnowballStemmer stemmer = (SnowballStemmer) stemClass
            .newInstance();

      for (String stopWord : stopList) {
        StringBuilder sb = new StringBuilder();

        for (String part : stopWord.split(" ")) {
          if (sb.length() > 0)
            sb.append(" ");
          stemmer.setCurrent(part);
          stemmer.stem();
          sb.append(stemmer.getCurrent());
        }

        stopSet.add(sb.toString());
      }
    }
   
    return stopSet;
  }

  /*
   * http://stackoverflow.com/questions/2453231/how-would-you-convert-a-string-
   * to-a-java-string-literal
   */
  private static String javaStringLiteral(String str) {
    StringBuilder sb = new StringBuilder("\"");
    for (int i = 0; i < str.length(); i++) {
      char c = str.charAt(i);
      if (c == '\n') {
        sb.append("\\n");
      } else if (c == '\r') {
        sb.append("\\r");
      } else if (c == '"') {
        sb.append("\\\"");
      } else if (c == '\\') {
        sb.append("\\\\");
      } else if (c < 0x20) {
        sb.append(String.format("\\%03o", (int) c));
      } else if (c >= 0x80) {
        sb.append(String.format("\\u%04x", (int) c));
      } else {
        sb.append(c);
      }
    }
    sb.append("\"");
    return sb.toString();
  }
}
TOP

Related Classes of StemmerStopBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.