Examples of Summary


Examples of org.apache.hadoop.zebra.mapred.ArticleGenerator.Summary

    LOG.info("Cleaning directory: " + batchName);
    fileSys.delete(batchDir, true);
    LOG.info("Generating input files: " + batchName);
    articalGen.batchArticalCreation(fileSys, new Path(srcPath, batchName),
        "doc-", options.srcFiles, options.srcFileLen);
    Summary s = articalGen.getSummary();
    // dumpSummary(s);
    long tmp = 0;
    // Sum all per-word counts recorded for this batch.
    for (long cnt : s.wordCntDist.values()) {
      tmp += cnt;
    }
View Full Code Here
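
For reference, the loop above simply totals the word-count distribution produced by the generator. A minimal, self-contained sketch of the same aggregation, using a fabricated map in place of Summary.wordCntDist (assumed to map each word to its occurrence count):

    import java.util.HashMap;
    import java.util.Map;

    public class WordCountSumSketch {
      public static void main(String[] args) {
        // Stand-in for Summary.wordCntDist, with made-up entries.
        Map<String, Long> wordCntDist = new HashMap<>();
        wordCntDist.put("alpha", 3L);
        wordCntDist.put("beta", 5L);

        long total = 0;
        for (long cnt : wordCntDist.values()) {
          total += cnt;
        }
        System.out.println("total word count = " + total); // prints 8
      }
    }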

Examples of org.apache.hadoop.zebra.mapred.ArticleGenerator.Summary

   * the ArticleGenerator.
   *
   * @throws IOException
   */
  void verifyWordCount() throws IOException, ParseException {
    Summary expected = new Summary();
    for (Iterator<Summary> it = summary.values().iterator(); it.hasNext();) {
      Summary e = it.next();
      // dumpSummary(e);
      reduce(expected, e);
    }
    // LOG.info("Dumping aggregated Summary");
    // dumpSummary(expected);

    Summary actual = new Summary();
    BasicTable.Reader reader = new BasicTable.Reader(invIndexTablePath, conf);
    reader.setProjection("count");
    TableScanner scanner = reader.getScanner(null, true);
    Tuple tuple = TypesUtils.createTuple(Projection.toSchema(scanner
        .getProjection()));
View Full Code Here
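
The reduce(expected, e) call above folds each per-batch Summary into one aggregate before it is compared against the table contents. A hedged sketch of what such a merge typically looks like for a word-count distribution (the method body and map shape here are assumptions, not the actual Zebra test code):

    import java.util.HashMap;
    import java.util.Map;

    public class SummaryMergeSketch {
      // Merge one word-count distribution into the accumulator.
      static void reduce(Map<String, Long> acc, Map<String, Long> other) {
        for (Map.Entry<String, Long> entry : other.entrySet()) {
          acc.merge(entry.getKey(), entry.getValue(), Long::sum);
        }
      }

      public static void main(String[] args) {
        Map<String, Long> expected = new HashMap<>();
        Map<String, Long> batch = new HashMap<>();
        batch.put("doc", 2L);
        reduce(expected, batch);
        System.out.println(expected); // prints {doc=2}
      }
    }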

Examples of org.apache.hadoop.zebra.mapreduce.ArticleGenerator.Summary

    LOG.info("Cleaning directory: " + batchName);
    fileSys.delete(batchDir, true);
    LOG.info("Generating input files: " + batchName);
    articalGen.batchArticalCreation(fileSys, new Path(srcPath, batchName),
        "doc-", options.srcFiles, options.srcFileLen);
    Summary s = articalGen.getSummary();
    // dumpSummary(s);
    long tmp = 0;
    // Sum all per-word counts recorded for this batch.
    for (long cnt : s.wordCntDist.values()) {
      tmp += cnt;
    }
View Full Code Here

Examples of org.apache.hadoop.zebra.mapreduce.ArticleGenerator.Summary

   * the ArticleGenerator.
   *
   * @throws IOException
   */
  void verifyWordCount() throws IOException, ParseException {
    Summary expected = new Summary();
    for (Iterator<Summary> it = summary.values().iterator(); it.hasNext();) {
      Summary e = it.next();
      // dumpSummary(e);
      reduce(expected, e);
    }
    // LOG.info("Dumping aggregated Summary");
    // dumpSummary(expected);

    Summary actual = new Summary();
    BasicTable.Reader reader = new BasicTable.Reader(invIndexTablePath, conf);
    reader.setProjection("count");
    TableScanner scanner = reader.getScanner(null, true);
    Tuple tuple = TypesUtils.createTuple(Projection.toSchema(scanner
        .getProjection()));
View Full Code Here
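
Note that the org.apache.hadoop.zebra.mapreduce snippets are the same test code as the org.apache.hadoop.zebra.mapred ones above: Zebra ships parallel packages targeting the old (mapred) and new (mapreduce) Hadoop MapReduce APIs, and the Summary-based generation and verification logic is shared between them.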

Examples of org.apache.nutch.searcher.Summary

    // TODO: check that phrases in the query are matched in the fragment
   
    Token[] tokens = getTokens(text);             // parse text to token array
   
    if (tokens.length == 0)
      return new Summary();
   
    String[] terms = query.getTerms();
    HashSet highlight = new HashSet();            // put query terms in a set for lookup
    for (int i = 0; i < terms.length; i++)
      highlight.add(terms[i]);
   
    // A list to store document's excerpts.
    // (An excerpt is a Vector full of Fragments and Highlights)
    List excerpts = new ArrayList();
   
    //
    // Iterate through all terms in the document
    //
    int lastExcerptPos = 0;
    for (int i = 0; i < tokens.length; i++) {
      //
      // If we find a term that's in the query...
      //
      if (highlight.contains(tokens[i].term())) {
        //
        // Start searching at a point SUM_CONTEXT terms back,
        // and move SUM_CONTEXT terms into the future.
        //
        int startToken = (i > sumContext) ? i - sumContext : 0;
        int endToken = Math.min(i + sumContext, tokens.length);
        int offset = tokens[startToken].startOffset();
        int j = startToken;
       
        //
        // Iterate from the start point to the finish, adding
        // terms all the way.  The end of the passage is always
        // SUM_CONTEXT beyond the last query-term.
        //
        Excerpt excerpt = new Excerpt(i);
        if (i != 0) {
          excerpt.add(new Summary.Ellipsis());
        }
       
        //
        // Iterate through as long as we're before the end of
        // the document and we haven't hit the max-number-of-items
        // -in-a-summary.
        //
        while ((j < endToken) && (j - startToken < sumLength)) {
          //
          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
          if (highlight.contains(t.term())) {
            excerpt.addToken(t.term());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
            offset = t.endOffset();
            endToken = Math.min(j + sumContext, tokens.length);
          }
         
          j++;
        }
       
        lastExcerptPos = endToken;
       
        //
        // We found the series of search-term hits and added
        // them (with intervening text) to the excerpt.  Now
        // we need to add the trailing edge of text.
        //
        // So if (j < tokens.length) then there is still trailing
        // text to add.  (We haven't hit the end of the source doc.)
        // Add the words since the last hit-term insert.
        //
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
        }
       
        //
        // Remember how many terms are in this excerpt
        //
        excerpt.setNumTerms(j - startToken);
       
        //
        // Store the excerpt for later sorting
        //
        excerpts.add(excerpt);
       
        //
        // Skip ahead: the next scan for query-term hits
        // resumes at j + SUM_CONTEXT.
        //
        i = j + sumContext;
      }
    }
   
    // Sort the excerpts based on their score
    Collections.sort(excerpts, SCORE_COMPARATOR);
   
    //
    // If none of the query terms appear in the text, just
    // excerpt the first SUM_LENGTH words of the document.
    //
    if (excerpts.size() == 0) {
      Excerpt excerpt = new Excerpt(0);
      int excerptLen = Math.min(sumLength, tokens.length);
      lastExcerptPos = excerptLen;
     
      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
      excerpt.setNumTerms(excerptLen);
      excerpts.add(excerpt);
    }
   
    //
    // Now choose the best items from the excerpt set.
    // Stop when we have enough excerpts to build our Summary.
    //
    double tokenCount = 0;
    int numExcerpt = excerpts.size()-1;
    List bestExcerpts = new ArrayList();
    while (tokenCount <= sumLength && numExcerpt >= 0) {
      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
      bestExcerpts.add(excerpt);
      tokenCount += excerpt.getNumTerms();
    }   
    // Sort the best excerpts based on their natural order
    Collections.sort(bestExcerpts, ORDER_COMPARATOR);
   
    //
    // Now build our Summary from the best excerpts.
    //
    tokenCount = 0;
    numExcerpt = 0;
    Summary s = new Summary();
    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
        Fragment f = (Fragment) e.nextElement();
        // Don't add this fragment if it would take us over the max limit
        if (tokenCount + tokenFraction <= sumLength) {
          s.add(f);
        }
        tokenCount += tokenFraction;
      }
    }
   
    if (tokenCount > 0 && lastExcerptPos < tokens.length)
      s.add(new Ellipsis());
    return s;
  }
View Full Code Here
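
For orientation, a hedged sketch of how a Summary like the one built above is usually obtained in Nutch 1.x. Summarizers implement the org.apache.nutch.searcher.Summarizer plugin interface; the getSummary(text, query) signature below is assumed from that era's API:

    import org.apache.nutch.searcher.Query;
    import org.apache.nutch.searcher.Summarizer;
    import org.apache.nutch.searcher.Summary;

    public class SummaryUsageSketch {
      // Render a query-biased snippet for one document's text.
      static String snippet(Summarizer summarizer, String text, Query query) {
        Summary summary = summarizer.getSummary(text, query);
        // Summary.toString() flattens its fragments, highlights and
        // ellipses into plain text.
        return summary.toString();
      }
    }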

Examples of org.apache.nutch.searcher.Summary

    for (int i=0; i<terms.length; i++) {
      weighted[i] = new WeightedSpanTerm(1.0f, terms[i]);
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
      // TODO : The max number of fragments (3) should be configurable
      String[] result = highlighter.getBestFragments(tokens, text, 3);
      for (int i=0; i<result.length; i++) {
        String[] parts = result[i].split(SEPARATOR);
        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }
    } catch (Exception e) {
      // Nothing to do...
    }
    return summary;
View Full Code Here
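
The two implementations above make opposite design choices. The first builds excerpts by hand: it scans the token stream for query-term hits, stitches Fragment, Highlight and Ellipsis parts together, and runs explicit scoring and reordering passes. The second delegates fragment selection to Lucene's Highlighter and only converts the marked-up best fragments back into Summary parts by splitting on SEPARATOR; that is far less code, but it cedes control over excerpt scoring and context size to Lucene.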