Package org.apache.nutch.summary.lucene

Source Code of org.apache.nutch.summary.lucene.LuceneSummarizer

/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.summary.lucene;

// JDK imports
import java.io.StringReader;
import java.util.logging.Level;
import java.util.logging.Logger;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.WeightedTerm;

// Nutch imports
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summarizer;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.searcher.Summary.Ellipsis;
import org.apache.nutch.searcher.Summary.Fragment;
import org.apache.nutch.searcher.Summary.Highlight;


/** Implements hit summarization. */
public class LuceneSummarizer implements Summarizer {
 
  private final static String SEPARATOR = "###";
  private final static Formatter FORMATTER =
          new SimpleHTMLFormatter(SEPARATOR, SEPARATOR);

  /** Converts text to tokens. */
  private Analyzer analyzer = null;
  private Configuration conf = null;
 
  public LuceneSummarizer() { }
 
  private LuceneSummarizer(Configuration conf) {
    setConf(conf);
  }
 
 
  /* ----------------------------- *
   * <implementation:Configurable> *
   * ----------------------------- */
 
  public Configuration getConf() {
    return conf;
  }
 
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.analyzer = new NutchDocumentAnalyzer(conf);
  }
 
  /* ------------------------------ *
   * </implementation:Configurable> *
   * ------------------------------ */
 
 
  /* --------------------------- *
   * <implementation:Summarizer> *
   * --------------------------- */
 
  public Summary getSummary(String text, Query query) {

    String[] terms = query.getTerms();
    WeightedTerm[] weighted = new WeightedTerm[terms.length];
    for (int i=0; i<terms.length; i++) {
      weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
      // TODO : The max number of fragments (3) should be configurable
      String[] result = highlighter.getBestFragments(tokens, text, 3);
      for (int i=0; i<result.length; i++) {
        String[] parts = result[i].split(SEPARATOR);
        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }
    } catch (Exception e) {
      // Nothing to do...
    }
    return summary;
  }

  /* ---------------------------- *
   * </implementation:Summarizer> *
   * ---------------------------- */
 
}
TOP

Related Classes of org.apache.nutch.summary.lucene.LuceneSummarizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.