Examples of org.apache.nutch.searcher.Summary.Fragment

Package org.apache.nutch.searcher.Summary

Examples of org.apache.nutch.searcher.Summary.Fragment

org.apache.nutch.searcher.Summary.Fragment

  }
  
  /** Test of <code>writable</code> implementation. */
  public void testWritable() throws Exception {
    Summary summary = new Summary();
    summary.add(new Fragment("fragment1.1"));
    summary.add(new Ellipsis());
    summary.add(new Highlight("highlight1"));
    summary.add(new Fragment("fragment1.2"));
    WritableTestUtils.testWritable(summary);
  }

View Full Code Here

        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }

View Full Code Here

          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
          if (highlight.contains(t.termText())) {
            excerpt.addToken(t.termText());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
            offset = t.endOffset();
            endToken = Math.min(j + sumContext, tokens.length);
          }
          
          j++;
        }
        
        lastExcerptPos = endToken;
        
        //
        // We found the series of search-term hits and added
        // them (with intervening text) to the excerpt.  Now
        // we need to add the trailing edge of text.
        //
        // So if (j < tokens.length) then there is still trailing
        // text to add.  (We haven't hit the end of the source doc.)
        // Add the words since the last hit-term insert.
        //
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
        }
        
        //
        // Remember how many terms are in this excerpt
        //
        excerpt.setNumTerms(j - startToken);
        
        //
        // Store the excerpt for later sorting
        //
        excerpts.add(excerpt);
        
        //
        // Start SUM_CONTEXT places away.  The next
        // search for relevant excerpts begins at i-SUM_CONTEXT
        //
        i = j + sumContext;
      }
    }
    
    // Sort the excerpts based on their score
    Collections.sort(excerpts, SCORE_COMPARATOR);
    
    //
    // If the target text doesn't appear, then we just
    // excerpt the first SUM_LENGTH words from the document.
    //
    if (excerpts.size() == 0) {
      Excerpt excerpt = new Excerpt(0);
      int excerptLen = Math.min(sumLength, tokens.length);
      lastExcerptPos = excerptLen;
      
      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
      excerpt.setNumTerms(excerptLen);
      excerpts.add(excerpt);
    }
    
    //
    // Now choose the best items from the excerpt set.
    // Stop when we have enough excerpts to build our Summary.
    //
    double tokenCount = 0;
    int numExcerpt = excerpts.size()-1;
    List bestExcerpts = new ArrayList();
    while (tokenCount <= sumLength && numExcerpt >= 0) {
      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
      bestExcerpts.add(excerpt);
      tokenCount += excerpt.getNumTerms();
    }    
    // Sort the best excerpts based on their natural order
    Collections.sort(bestExcerpts, ORDER_COMPARATOR);
    
    //
    // Now build our Summary from the best excerpts.
    //
    tokenCount = 0;
    numExcerpt = 0;
    Summary s = new Summary();
    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
        Fragment f = (Fragment) e.nextElement();
        // Don't add fragments if it takes us over the max-limit
        if (tokenCount + tokenFraction <= sumLength) {
          s.add(f);
        }
        tokenCount += tokenFraction;

View Full Code Here

        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }

View Full Code Here

          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
          if (highlight.contains(t.termText())) {
            excerpt.addToken(t.termText());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
            offset = t.endOffset();
            endToken = Math.min(j + sumContext, tokens.length);
          }
          
          j++;
        }
        
        lastExcerptPos = endToken;
        
        //
        // We found the series of search-term hits and added
        // them (with intervening text) to the excerpt.  Now
        // we need to add the trailing edge of text.
        //
        // So if (j < tokens.length) then there is still trailing
        // text to add.  (We haven't hit the end of the source doc.)
        // Add the words since the last hit-term insert.
        //
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
        }
        
        //
        // Remember how many terms are in this excerpt
        //
        excerpt.setNumTerms(j - startToken);
        
        //
        // Store the excerpt for later sorting
        //
        excerpts.add(excerpt);
        
        //
        // Start SUM_CONTEXT places away.  The next
        // search for relevant excerpts begins at i-SUM_CONTEXT
        //
        i = j + sumContext;
      }
    }
    
    // Sort the excerpts based on their score
    Collections.sort(excerpts, SCORE_COMPARATOR);
    
    //
    // If the target text doesn't appear, then we just
    // excerpt the first SUM_LENGTH words from the document.
    //
    if (excerpts.size() == 0) {
      Excerpt excerpt = new Excerpt(0);
      int excerptLen = Math.min(sumLength, tokens.length);
      lastExcerptPos = excerptLen;
      
      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
      excerpt.setNumTerms(excerptLen);
      excerpts.add(excerpt);
    }
    
    //
    // Now choose the best items from the excerpt set.
    // Stop when we have enough excerpts to build our Summary.
    //
    double tokenCount = 0;
    int numExcerpt = excerpts.size()-1;
    List bestExcerpts = new ArrayList();
    while (tokenCount <= sumLength && numExcerpt >= 0) {
      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
      bestExcerpts.add(excerpt);
      tokenCount += excerpt.getNumTerms();
    }    
    // Sort the best excerpts based on their natural order
    Collections.sort(bestExcerpts, ORDER_COMPARATOR);
    
    //
    // Now build our Summary from the best excerpts.
    //
    tokenCount = 0;
    numExcerpt = 0;
    Summary s = new Summary();
    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
        Fragment f = (Fragment) e.nextElement();
        // Don't add fragments if it takes us over the max-limit
        if (tokenCount + tokenFraction <= sumLength) {
          s.add(f);
        }
        tokenCount += tokenFraction;

View Full Code Here

  }
  


  /** Test of <code>Fragment</code> inner class */
  public void testFragment() {
    Fragment fragment = new Fragment("fragment text");
    assertEquals("fragment text", fragment.getText());
    assertEquals("fragment text", fragment.toString());
    assertFalse(fragment.isEllipsis());
    assertFalse(fragment.isHighlight());
    assertTrue(fragment.equals(new Fragment("fragment text")));
    assertFalse(fragment.equals(new Fragment("some text")));
    assertFalse(fragment.equals(new Ellipsis()));
    assertFalse(fragment.equals(new Highlight("fragment text")));
  }

View Full Code Here

    assertFalse(fragment.equals(new Highlight("fragment text")));
  }


  /** Test of <code>Ellipsis</code> inner class */
  public void testEllipsis() {
    Fragment fragment = new Ellipsis();
    assertEquals(" ... ", fragment.getText());
    assertEquals(" ... ", fragment.toString());
    assertTrue(fragment.isEllipsis());
    assertFalse(fragment.isHighlight());
    assertFalse(fragment.equals(new Fragment("fragment text")));
    assertTrue(fragment.equals(new Ellipsis()));
    assertFalse(fragment.equals(new Highlight("fragment text")));
  }

View Full Code Here

    assertFalse(fragment.equals(new Highlight("fragment text")));
  }


  /** Test of <code>Highlight</code> inner class */
  public void testHighlight() {
    Fragment fragment = new Highlight("highlight text");
    assertEquals("highlight text", fragment.getText());
    assertEquals("highlight text", fragment.toString());
    assertFalse(fragment.isEllipsis());
    assertTrue(fragment.isHighlight());
    assertFalse(fragment.equals(new Fragment("fragment text")));
    assertFalse(fragment.equals(new Ellipsis()));
    assertFalse(fragment.equals(new Highlight("fragment text")));
    assertTrue(fragment.equals(new Highlight("highlight text")));
  }

View Full Code Here


  /** Test of <code>add</code> / <code>get</code> methods */
  public void testAdd() {
    Fragment[] fragments = null;
    Summary summary = new Summary();
    summary.add(new Fragment("fragment1"));
    fragments = summary.getFragments();
    assertEquals(1, fragments.length);
    assertEquals("fragment1", fragments[0].toString());
    summary.add(new Fragment("fragment2"));
    fragments = summary.getFragments();
    assertEquals(2, fragments.length);
    assertEquals("fragment1", fragments[0].toString());
    assertEquals("fragment2", fragments[1].toString());
    summary.add(new Fragment("fragment3"));
    fragments = summary.getFragments();
    assertEquals(3, fragments.length);
    assertEquals("fragment1", fragments[0].toString());
    assertEquals("fragment2", fragments[1].toString());
    assertEquals("fragment3", fragments[2].toString());

View Full Code Here


  /** Test of <code>toString</code> method. */
  public void testToString() {
    Summary summary = new Summary();
    assertEquals("", summary.toString());
    summary.add(new Fragment("fragment1"));
    assertEquals("fragment1", summary.toString());
    summary.add(new Ellipsis());
    assertEquals("fragment1 ... ", summary.toString());
    summary.add(new Highlight("highlight"));
    assertEquals("fragment1 ... highlight", summary.toString());
    summary.add(new Fragment("fragment2"));
    assertEquals("fragment1 ... highlightfragment2", summary.toString());    
  }

View Full Code Here

0 1 2

TOP

Related Classes of org.apache.nutch.searcher.Summary.Fragment

org.apache.nutch.searcher.TestSummary

org.apache.nutch.summary.basic.BasicSummarizer

org.apache.nutch.summary.lucene.LuceneSummarizer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.