Package org.apache.nutch.searcher.Summary

Examples of org.apache.nutch.searcher.Summary.Fragment


          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
          if (highlight.contains(t.term())) {
            excerpt.addToken(t.term());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
            offset = t.endOffset();
            endToken = Math.min(j + sumContext, tokens.length);
          }
         
          j++;
        }
       
        lastExcerptPos = endToken;
       
        //
        // We found the series of search-term hits and added
        // them (with intervening text) to the excerpt.  Now
        // we need to add the trailing edge of text.
        //
        // So if (j < tokens.length) then there is still trailing
        // text to add.  (We haven't hit the end of the source doc.)
        // Add the words since the last hit-term insert.
        //
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
        }
       
        //
        // Remember how many terms are in this excerpt
        //
        excerpt.setNumTerms(j - startToken);
       
        //
        // Store the excerpt for later sorting
        //
        excerpts.add(excerpt);
       
        //
        // Start SUM_CONTEXT places away.  The next
        // search for relevant excerpts begins at i-SUM_CONTEXT
        //
        i = j + sumContext;
      }
    }
   
    // Sort the excerpts based on their score
    Collections.sort(excerpts, SCORE_COMPARATOR);
   
    //
    // If the target text doesn't appear, then we just
    // excerpt the first SUM_LENGTH words from the document.
    //
    if (excerpts.size() == 0) {
      Excerpt excerpt = new Excerpt(0);
      int excerptLen = Math.min(sumLength, tokens.length);
      lastExcerptPos = excerptLen;
     
      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
      excerpt.setNumTerms(excerptLen);
      excerpts.add(excerpt);
    }
   
    //
    // Now choose the best items from the excerpt set.
    // Stop when we have enough excerpts to build our Summary.
    //
    double tokenCount = 0;
    int numExcerpt = excerpts.size()-1;
    List bestExcerpts = new ArrayList();
    while (tokenCount <= sumLength && numExcerpt >= 0) {
      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
      bestExcerpts.add(excerpt);
      tokenCount += excerpt.getNumTerms();
    }   
    // Sort the best excerpts based on their natural order
    Collections.sort(bestExcerpts, ORDER_COMPARATOR);
   
    //
    // Now build our Summary from the best excerpts.
    //
    tokenCount = 0;
    numExcerpt = 0;
    Summary s = new Summary();
    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
        Fragment f = (Fragment) e.nextElement();
        // Don't add fragments if it takes us over the max-limit
        if (tokenCount + tokenFraction <= sumLength) {
          s.add(f);
        }
        tokenCount += tokenFraction;
View Full Code Here


        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }
View Full Code Here

        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }
     
      /* TODO MC  BUG resolved 0000029 - if the query terms do not occur in the text, an empty summary was returned. Now the first tokens are returned instead. */
      if (result==null || result.length==0) {
        tokens = analyzer.tokenStream("content", new StringReader(text));
             
        Token firstToken=null, lastToken=null;
        Token token=null;
        int maxLen=100; // the same as defined in SimpleFragmenter but it is private
       
        /*
        ArrayList<Token> titleTokens=new ArrayList<Token>();
        ArrayList<Token> textTokens=new ArrayList<Token>();
        boolean titleMatched=false;
        boolean hasMatched=false; // exit match after match title the first time            
       
        // remove title from text. compares pairs of text
        while ((titleMatched || !hasMatched) && (token=tokens.next())!=null) {
         
          if (token.type().equals("<WORD>")) {
         
            if (titleTokens.size()==0) {
              titleTokens.add(token);
            }
            else if (textTokens.size()<titleTokens.size()) {
              textTokens.add(token);
            }
         
            if (textTokens.size()==titleTokens.size()) {
              // compare
              titleMatched=true;
              for (int i=0;i<textTokens.size() && titleMatched;i++) {
                if (!textTokens.get(i).termText().equals(titleTokens.get(i).termText())) {
                  titleMatched=false;   
                }               
              }
              if (titleMatched) { // try to match a larger pattern
                titleTokens.add(textTokens.get(0));
                textTokens.remove(0);
                hasMatched=true;
              }
              else { // remove rest of title from text
                if (hasMatched) {
                  firstToken=textTokens.get(titleTokens.size()-2);                                 
                }
                else { // add one more token to title
                  titleTokens.add(textTokens.get(0));
                    textTokens.remove(0);
                }
              }
            }
          }       
        }
       
        if (textTokens.size()==0) {
          return summary;
        }
                             
        for (int i=0;i<textTokens.size() && textTokens.get(i).endOffset()-firstToken.startOffset()<maxLen;i++) {
          lastToken=textTokens.get(i);
        }
        */
                     
        // read tokens until maxLen
        while ((token=tokens.next())!=null) {       
          if (token.type().equals("<WORD>")) {
            if (firstToken==null) {
              firstToken=token;
            }
            else if (token.endOffset()-firstToken.startOffset()<maxLen) {         
              lastToken=token;                         
            }                   
            else {
              break;
            }
          }
        }       
        if (lastToken==null) {
          lastToken=firstToken;
        }
       
        summary.add(new Fragment(text.substring(firstToken.startOffset(), lastToken.endOffset())));
        summary.add(new Ellipsis());
      }
      /* TODO MC */
     
    } catch (Exception e) {
View Full Code Here

  }
 

  /** Test of <code>Fragment</code> inner class */
  public void testFragment() {
    Fragment fragment = new Fragment("fragment text");
    assertEquals("fragment text", fragment.getText());
    assertEquals("fragment text", fragment.toString());
    assertFalse(fragment.isEllipsis());
    assertFalse(fragment.isHighlight());
    assertTrue(fragment.equals(new Fragment("fragment text")));
    assertFalse(fragment.equals(new Fragment("some text")));
    assertFalse(fragment.equals(new Ellipsis()));
    assertFalse(fragment.equals(new Highlight("fragment text")));
  }
View Full Code Here

    assertFalse(fragment.equals(new Highlight("fragment text")));
  }

  /** Test of <code>Ellipsis</code> inner class */
  public void testEllipsis() {
    Fragment fragment = new Ellipsis();
    assertEquals(" ... ", fragment.getText());
    assertEquals(" ... ", fragment.toString());
    assertTrue(fragment.isEllipsis());
    assertFalse(fragment.isHighlight());
    assertFalse(fragment.equals(new Fragment("fragment text")));
    assertTrue(fragment.equals(new Ellipsis()));
    assertFalse(fragment.equals(new Highlight("fragment text")));
  }
View Full Code Here

    assertFalse(fragment.equals(new Highlight("fragment text")));
  }

  /** Test of <code>Highlight</code> inner class */
  public void testHighlight() {
    Fragment fragment = new Highlight("highlight text");
    assertEquals("highlight text", fragment.getText());
    assertEquals("highlight text", fragment.toString());
    assertFalse(fragment.isEllipsis());
    assertTrue(fragment.isHighlight());
    assertFalse(fragment.equals(new Fragment("fragment text")));
    assertFalse(fragment.equals(new Ellipsis()));
    assertFalse(fragment.equals(new Highlight("fragment text")));
    assertTrue(fragment.equals(new Highlight("highlight text")));
  }
View Full Code Here

  /** Test of <code>add</code> / <code>get</code> methods */
  public void testAdd() {
    Fragment[] fragments = null;
    Summary summary = new Summary();
    summary.add(new Fragment("fragment1"));
    fragments = summary.getFragments();
    assertEquals(1, fragments.length);
    assertEquals("fragment1", fragments[0].toString());
    summary.add(new Fragment("fragment2"));
    fragments = summary.getFragments();
    assertEquals(2, fragments.length);
    assertEquals("fragment1", fragments[0].toString());
    assertEquals("fragment2", fragments[1].toString());
    summary.add(new Fragment("fragment3"));
    fragments = summary.getFragments();
    assertEquals(3, fragments.length);
    assertEquals("fragment1", fragments[0].toString());
    assertEquals("fragment2", fragments[1].toString());
    assertEquals("fragment3", fragments[2].toString());
View Full Code Here

  /** Test of <code>toString</code> method. */
  public void testToString() {
    Summary summary = new Summary();
    assertEquals("", summary.toString());
    summary.add(new Fragment("fragment1"));
    assertEquals("fragment1", summary.toString());
    summary.add(new Ellipsis());
    assertEquals("fragment1 ... ", summary.toString());
    summary.add(new Highlight("highlight"));
    assertEquals("fragment1 ... highlight", summary.toString());
    summary.add(new Fragment("fragment2"));
    assertEquals("fragment1 ... highlightfragment2", summary.toString());   
  }
View Full Code Here

  }

  /** Test of <code>toStrings</code>. */
  public void testToStrings() {
    Summary[] summaries = { new Summary(), new Summary() };
    summaries[0].add(new Fragment("fragment1.1"));
    summaries[0].add(new Ellipsis());
    summaries[0].add(new Highlight("highlight1"));
    summaries[0].add(new Fragment("fragment1.2"));
    summaries[1].add(new Fragment("fragment2.1"));
    summaries[1].add(new Ellipsis());
    summaries[1].add(new Highlight("highlight2"));
    summaries[1].add(new Fragment("fragment2.2"));
    String[] strings = Summary.toStrings(summaries);
    assertEquals(2, strings.length);
    assertEquals("fragment1.1 ... highlight1fragment1.2", strings[0]);
    assertEquals("fragment2.1 ... highlight2fragment2.2", strings[1]);
  }
View Full Code Here

    Summary summary1 = new Summary();
    Summary summary2 = new Summary();
    assertFalse(summary1.equals(null));
    assertFalse(summary1.equals(""));
    assertTrue(summary1.equals(summary2));
    summary1.add(new Fragment("text fragment"));
    assertFalse(summary1.equals(summary2));
    summary2.add(new Fragment("text fragment"));
    assertTrue(summary1.equals(summary2));
    summary1.add(new Ellipsis());
    assertFalse(summary1.equals(summary2));
    summary2.add(new Ellipsis());
    assertTrue(summary1.equals(summary2));
    summary1.add(new Highlight("highlight"));
    assertFalse(summary1.equals(summary2));
    summary2.add(new Highlight("highlight"));
    assertTrue(summary1.equals(summary2));
    summary1.add(new Fragment("text fragment"));
    summary2.add(new Fragment("fragment text"));
    assertFalse(summary1.equals(summary2));
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.searcher.Summary.Fragment

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.