Package net.paoding.analysis.analyzer.estimate

Source Code of net.paoding.analysis.analyzer.estimate.Estimate

package net.paoding.analysis.analyzer.estimate;

import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.LinkedList;

import net.paoding.analysis.analyzer.PaodingTokenizer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class Estimate {
  private Analyzer analyzer;
  private String print;
  private PrintGate printGate;

  public Estimate() {
    this.setPrint("50");// 默认只打印前50行分词效果
  }

  public Estimate(Analyzer analyzer) {
    setAnalyzer(analyzer);
    this.setPrint("50");// 默认只打印前50行分词效果
  }

  public void setAnalyzer(Analyzer analyzer) {
    this.analyzer = analyzer;
  }

  public Analyzer getAnalyzer() {
    return analyzer;
  }

  public void setPrint(String print) {
    if (print == null || print.length() == 0
        || print.equalsIgnoreCase("null")
        || print.equalsIgnoreCase("no")) {
      printGate = null;
      this.print = null;
    } else {
      printGate = new LinePrintGate();
      printGate.setPrint(print, 10);
      this.print = print;
    }
  }

  public String getPrint() {
    return print;
  }

  public void test(String input) {
    this.test(System.out, input);
  }

  public void test(PrintStream out, String input) {
    Reader reader = new StringReaderEx(input);
    this.test(out, reader);
  }

  public void test(PrintStream out, Reader reader) {
    try {
      long begin = System.currentTimeMillis();
     
      LinkedList<CToken> list = new LinkedList<CToken>();
      int wordsCount = 0;
     
      //collect token
      TokenStream ts = analyzer.tokenStream("", reader);
      ts.reset();
      TermAttribute termAtt = (TermAttribute) ts
          .addAttribute(TermAttribute.class);
      while (ts.incrementToken()) {
        if (printGate != null && printGate.filter(wordsCount)) {
          list.add(new CToken(termAtt.term(), wordsCount));
        }
        wordsCount++;
      }
     
      long end = System.currentTimeMillis();
      int c = 0;
      if (list.size() > 0) {
        for (CToken ctoken : list) {
          c = ctoken.i;
          if (c % 10 == 0) {
            if (c != 0) {
              out.println();
            }
            out.print((c / 10 + 1) + ":\t");
          }
          out.print(ctoken.t + "/");
        }
      }
      if (wordsCount == 0) {
        System.out.println("\tAll are noise characters or words");
      } else {
        if (c % 10 != 1) {
          System.out.println();
        }
        String inputLength = "<未知>";
        if (reader instanceof StringReaderEx) {
          inputLength = "" + ((StringReaderEx) reader).inputLength;
        } else if (ts instanceof PaodingTokenizer) {
          inputLength = "" + ((PaodingTokenizer) ts).getInputLength();
        }
        System.out.println();
        System.out.println("\t分词器" + analyzer.getClass().getName());
        System.out.println("\t内容长度 " + inputLength + "字符, 分 "
            + wordsCount + "个词");
        System.out.println("\t分词耗时 " + (end - begin) + "ms ");
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      try {
        reader.close();
      } catch (IOException e) {
      }
    }
  }

  // -------------------------------------------

  static class CToken {
    String t;
    int i;

    CToken(String t, int i) {
      this.t = t;
      this.i = i;
    }
  }

  static interface PrintGate {
    public void setPrint(String print, int unitSize);

    boolean filter(int count);
  }

  static class PrintGateToken implements PrintGate {
    private int begin;
    private int end;

    public void setBegin(int begin) {
      this.begin = begin;
    }

    public void setEnd(int end) {
      this.end = end;
    }

    public void setPrint(String print, int unitSize) {
      int i = print.indexOf('-');
      if (i > 0) {
        int bv = Integer.parseInt(print.substring(0, i));
        int ev = Integer.parseInt(print.substring(i + 1));
        setBegin(unitSize * (Math.abs(bv) - 1));// 第5行,是从第40开始的
        setEnd(unitSize * Math.abs(ev));// 到第10行,是截止于100(不包含该边界)
      } else {
        setBegin(0);
        int v = Integer.parseInt(print);
        setEnd(unitSize * (Math.abs(v)));
      }
    }

    public boolean filter(int count) {
      return count >= begin && count < end;
    }
  }

  static class LinePrintGate implements PrintGate {

    private PrintGate[] list;

    public void setPrint(String print, int unitSize) {
      String[] prints = print.split(",");
      list = new PrintGate[prints.length];
      for (int i = 0; i < prints.length; i++) {
        PrintGateToken pg = new PrintGateToken();
        pg.setPrint(prints[i], unitSize);
        list[i] = pg;
      }
    }

    public boolean filter(int count) {
      for (int i = 0; i < list.length; i++) {
        if (list[i].filter(count)) {
          return true;
        }
      }
      return false;
    }

  }

  static class StringReaderEx extends StringReader {
    private int inputLength;

    public StringReaderEx(String s) {
      super(s);
      inputLength = s.length();
    }
  }

}
TOP

Related Classes of net.paoding.analysis.analyzer.estimate.Estimate

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.