Package: org.apache.nutch.searcher

Examples of org.apache.nutch.searcher.Summary$Highlight


    // NOTE(review): this fragment begins mid-method — the enclosing signature
    // (presumably a getSummary(String text, Query query)-style method) is not
    // visible here. Builds a Summary of highlighted excerpts around query-term
    // hits in `text`.
    // TODO: check that phrases in the query are matched in the fragment

    Token[] tokens = getTokens(text);             // parse text to token array

    if (tokens.length == 0)
      return new Summary();                       // nothing to summarize

    String[] terms = query.getTerms();
    HashSet highlight = new HashSet();            // put query terms in table
    for (int i = 0; i < terms.length; i++)
      highlight.add(terms[i]);

    // A list to store document's excerpts.
    // (An excerpt is a Vector full of Fragments and Highlights)
    List excerpts = new ArrayList();

    //
    // Iterate through all terms in the document
    //
    int lastExcerptPos = 0;
    for (int i = 0; i < tokens.length; i++) {
      //
      // If we find a term that's in the query...
      //
      if (highlight.contains(tokens[i].term())) {
        //
        // Start searching at a point SUM_CONTEXT terms back,
        // and move SUM_CONTEXT terms into the future.
        //
        int startToken = (i > sumContext) ? i - sumContext : 0;
        int endToken = Math.min(i + sumContext, tokens.length);
        int offset = tokens[startToken].startOffset();
        int j = startToken;

        //
        // Iterate from the start point to the finish, adding
        // terms all the way.  The end of the passage is always
        // SUM_CONTEXT beyond the last query-term.
        //
        // NOTE(review): Excerpt(i) presumably records the hit position for
        // ORDER_COMPARATOR below — confirm against Excerpt's definition.
        Excerpt excerpt = new Excerpt(i);
        if (i != 0) {
          excerpt.add(new Summary.Ellipsis());
        }

        //
        // Iterate through as long as we're before the end of
        // the document and we haven't hit the max-number-of-items
        // -in-a-summary.
        //
        while ((j < endToken) && (j - startToken < sumLength)) {
          //
          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
          if (highlight.contains(t.term())) {
            excerpt.addToken(t.term());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
            offset = t.endOffset();
            // extend the window so the passage always ends sumContext
            // tokens past the most recent hit
            endToken = Math.min(j + sumContext, tokens.length);
          }

          j++;
        }

        lastExcerptPos = endToken;

        //
        // We found the series of search-term hits and added
        // them (with intervening text) to the excerpt.  Now
        // we need to add the trailing edge of text.
        //
        // So if (j < tokens.length) then there is still trailing
        // text to add.  (We haven't hit the end of the source doc.)
        // Add the words since the last hit-term insert.
        //
        // NOTE(review): tokens[j] is the first token AFTER the window, so the
        // trailing fragment includes one extra token — TODO confirm intended.
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
        }

        //
        // Remember how many terms are in this excerpt
        //
        excerpt.setNumTerms(j - startToken);

        //
        // Store the excerpt for later sorting
        //
        excerpts.add(excerpt);

        //
        // Skip ahead past the context window we just consumed: with the
        // outer loop's i++, the next scan resumes at j + sumContext + 1.
        // (The old comment claiming "begins at i-SUM_CONTEXT" was wrong.)
        //
        i = j + sumContext;
      }
    }

    // Sort the excerpts based on their score
    Collections.sort(excerpts, SCORE_COMPARATOR);

    //
    // If the target text doesn't appear, then we just
    // excerpt the first SUM_LENGTH words from the document.
    //
    if (excerpts.size() == 0) {
      Excerpt excerpt = new Excerpt(0);
      int excerptLen = Math.min(sumLength, tokens.length);
      lastExcerptPos = excerptLen;

      // NOTE(review): startOffset() of the LAST token truncates the final
      // word from the excerpt; endOffset() looks intended — TODO confirm.
      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
      excerpt.setNumTerms(excerptLen);
      excerpts.add(excerpt);
    }

    //
    // Now choose the best items from the excerpt set.
    // Stop when we have enough excerpts to build our Summary.
    //
    // NOTE(review): excerpts are consumed from the END of the sorted list,
    // which assumes SCORE_COMPARATOR sorts ascending (best last) — confirm.
    double tokenCount = 0;
    int numExcerpt = excerpts.size()-1;
    List bestExcerpts = new ArrayList();
    while (tokenCount <= sumLength && numExcerpt >= 0) {
      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
      bestExcerpts.add(excerpt);
      tokenCount += excerpt.getNumTerms();
    }
    // Sort the best excerpts based on their natural order
    Collections.sort(bestExcerpts, ORDER_COMPARATOR);

    //
    // Now build our Summary from the best excerpts.
    //
    tokenCount = 0;
    numExcerpt = 0;
    Summary s = new Summary();
    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
      // each fragment counts as a uniform fraction of the excerpt's terms
      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
        Fragment f = (Fragment) e.nextElement();
        // Don't add fragments if it takes us over the max-limit
        if (tokenCount + tokenFraction <= sumLength) {
          s.add(f);
        }
        tokenCount += tokenFraction;
      }
    }

    // trailing ellipsis when the document continues past the last excerpt
    if (tokenCount > 0 && lastExcerptPos < tokens.length)
      s.add(new Ellipsis());
    return s;
  }
View Full Code Here


    // NOTE(review): fragment begins mid-method; `weighted`, `terms`,
    // `analyzer`, `text`, FORMATTER and SEPARATOR are declared outside view.
    for (int i=0; i<terms.length; i++) {
      weighted[i] = new WeightedSpanTerm(1.0f, terms[i]);   // all terms weighted equally
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
      // TODO : The max number of fragments (3) should be configurable
      String[] result = highlighter.getBestFragments(tokens, text, 3);
      for (int i=0; i<result.length; i++) {
        // FORMATTER presumably brackets each hit with SEPARATOR, so split()
        // yields alternating plain/highlighted parts starting with plain
        // text — TODO confirm against FORMATTER's definition.
        String[] parts = result[i].split(SEPARATOR);
        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }
    } catch (Exception e) {
      // NOTE(review): every exception (including highlighter failures) is
      // silently swallowed, yielding a possibly-empty summary; consider
      // at least logging here.
    }
    return summary;
View Full Code Here

    }

    // Deserialize the array of per-hit summaries (tail of a Writable
    // readFields(DataInput) implementation; method header not visible here).
    int summariesLength = in.readInt();
    // NOTE(review): length is read from the stream unvalidated; a corrupt or
    // hostile stream could force a huge allocation — consider a sanity bound.
    summaries = new Summary[summariesLength];
    for (int i = 0; i < summariesLength; i++) {
      summaries[i] = new Summary();
      summaries[i].readFields(in);   // each Summary deserializes its own fields
    }
  }
View Full Code Here

    for (int i=0; i<terms.length; i++) {
      weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
      // TODO : The max number of fragments (3) should be configurable
      String[] result = highlighter.getBestFragments(tokens, text, 3);
      for (int i=0; i<result.length; i++) {
        String[] parts = result[i].split(SEPARATOR);
        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }
     
      /* TODO MC  BUG resolved 0000029 - if query terms do not occur on text, an empty summary is returned. Now it sends the first tokens. */
      if (result==null || result.length==0) {
        tokens = analyzer.tokenStream("content", new StringReader(text));
             
        Token firstToken=null, lastToken=null;
        Token token=null;
        int maxLen=100; // the same as defined in SimpleFragmenter but it is private
       
        /*
        ArrayList<Token> titleTokens=new ArrayList<Token>();
        ArrayList<Token> textTokens=new ArrayList<Token>();
        boolean titleMatched=false;
        boolean hasMatched=false; // exit match after match title the first time            
       
        // remove title from text. compares pairs of text
        while ((titleMatched || !hasMatched) && (token=tokens.next())!=null) {
         
          if (token.type().equals("<WORD>")) {
         
            if (titleTokens.size()==0) {
              titleTokens.add(token);
            }
            else if (textTokens.size()<titleTokens.size()) {
              textTokens.add(token);
            }
         
            if (textTokens.size()==titleTokens.size()) {
              // compare
              titleMatched=true;
              for (int i=0;i<textTokens.size() && titleMatched;i++) {
                if (!textTokens.get(i).termText().equals(titleTokens.get(i).termText())) {
                  titleMatched=false;   
                }               
              }
              if (titleMatched) { // try to match a larger pattern
                titleTokens.add(textTokens.get(0));
                textTokens.remove(0);
                hasMatched=true;
              }
              else { // remove rest of title from text
                if (hasMatched) {
                  firstToken=textTokens.get(titleTokens.size()-2);                                 
                }
                else { // add one more token to title
                  titleTokens.add(textTokens.get(0));
                    textTokens.remove(0);
                }
              }
            }
          }       
        }
       
        if (textTokens.size()==0) {
          return summary;
        }
                             
        for (int i=0;i<textTokens.size() && textTokens.get(i).endOffset()-firstToken.startOffset()<maxLen;i++) {
          lastToken=textTokens.get(i);
        }
        */
                     
        // read tokens until maxLen
        // NOTE(review): if the text yields no "<WORD>" tokens, firstToken
        // (and lastToken) remain null and the substring below throws a
        // NullPointerException — TODO confirm/guard.
        while ((token=tokens.next())!=null) {
          if (token.type().equals("<WORD>")) {
            if (firstToken==null) {
              firstToken=token;            // first word anchors the excerpt
            }
            else if (token.endOffset()-firstToken.startOffset()<maxLen) {
              lastToken=token;             // extend while under maxLen chars
            }
            else {
              break;                       // excerpt window full
            }
          }
        }
        if (lastToken==null) {
          lastToken=firstToken;            // only one word fit in the window
        }

        // fallback excerpt: first ~maxLen characters of word content
        summary.add(new Fragment(text.substring(firstToken.startOffset(), lastToken.endOffset())));
        summary.add(new Ellipsis());
      }
      /* TODO MC */
     
    } catch (Exception e) {
      // Nothing to do...
View Full Code Here

    for (int i=0; i<terms.length; i++) {
      weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
      // TODO : The max number of fragments (3) should be configurable
      String[] result = highlighter.getBestFragments(tokens, text, 3);
      for (int i=0; i<result.length; i++) {
        String[] parts = result[i].split(SEPARATOR);
        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }
    } catch (Exception e) {
      // Nothing to do...
    }
    return summary;
View Full Code Here

    // TODO: check that phrases in the query are matched in the fragment
   
    Token[] tokens = getTokens(text);             // parse text to token array
   
    if (tokens.length == 0)
      return new Summary();
   
    String[] terms = query.getTerms();
    HashSet highlight = new HashSet();            // put query terms in table
    for (int i = 0; i < terms.length; i++)
      highlight.add(terms[i]);
   
    // A list to store document's excerpts.
    // (An excerpt is a Vector full of Fragments and Highlights)
    List excerpts = new ArrayList();
   
    //
    // Iterate through all terms in the document
    //
    int lastExcerptPos = 0;
    for (int i = 0; i < tokens.length; i++) {
      //
      // If we find a term that's in the query...
      //
      if (highlight.contains(tokens[i].termText())) {
        //
        // Start searching at a point SUM_CONTEXT terms back,
        // and move SUM_CONTEXT terms into the future.
        //
        int startToken = (i > sumContext) ? i - sumContext : 0;
        int endToken = Math.min(i + sumContext, tokens.length);
        int offset = tokens[startToken].startOffset();
        int j = startToken;
       
        //
        // Iterate from the start point to the finish, adding
        // terms all the way.  The end of the passage is always
        // SUM_CONTEXT beyond the last query-term.
        //
        Excerpt excerpt = new Excerpt(i);
        if (i != 0) {
          excerpt.add(new Summary.Ellipsis());
        }
       
        //
        // Iterate through as long as we're before the end of
        // the document and we haven't hit the max-number-of-items
        // -in-a-summary.
        //
        while ((j < endToken) && (j - startToken < sumLength)) {
          //
          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
          if (highlight.contains(t.termText())) {
            excerpt.addToken(t.termText());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
            offset = t.endOffset();
            endToken = Math.min(j + sumContext, tokens.length);
          }
         
          j++;
        }
       
        lastExcerptPos = endToken;
       
        //
        // We found the series of search-term hits and added
        // them (with intervening text) to the excerpt.  Now
        // we need to add the trailing edge of text.
        //
        // So if (j < tokens.length) then there is still trailing
        // text to add.  (We haven't hit the end of the source doc.)
        // Add the words since the last hit-term insert.
        //
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
        }
       
        //
        // Remember how many terms are in this excerpt
        //
        excerpt.setNumTerms(j - startToken);
       
        //
        // Store the excerpt for later sorting
        //
        excerpts.add(excerpt);
       
        //
        // Start SUM_CONTEXT places away.  The next
        // search for relevant excerpts begins at i-SUM_CONTEXT
        //
        i = j + sumContext;
      }
    }
   
    // Sort the excerpts based on their score
    Collections.sort(excerpts, SCORE_COMPARATOR);
   
    //
    // If the target text doesn't appear, then we just
    // excerpt the first SUM_LENGTH words from the document.
    //
    if (excerpts.size() == 0) {
      Excerpt excerpt = new Excerpt(0);
      int excerptLen = Math.min(sumLength, tokens.length);
      lastExcerptPos = excerptLen;
     
      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
      excerpt.setNumTerms(excerptLen);
      excerpts.add(excerpt);
    }
   
    //
    // Now choose the best items from the excerpt set.
    // Stop when we have enought excerpts to build our Summary.
    //
    double tokenCount = 0;
    int numExcerpt = excerpts.size()-1;
    List bestExcerpts = new ArrayList();
    while (tokenCount <= sumLength && numExcerpt >= 0) {
      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
      bestExcerpts.add(excerpt);
      tokenCount += excerpt.getNumTerms();
    }   
    // Sort the best excerpts based on their natural order
    Collections.sort(bestExcerpts, ORDER_COMPARATOR);
   
    //
    // Now build our Summary from the best the excerpts.
    //
    tokenCount = 0;
    numExcerpt = 0;
    Summary s = new Summary();
    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
        Fragment f = (Fragment) e.nextElement();
        // Don't add fragments if it takes us over the max-limit
        if (tokenCount + tokenFraction <= sumLength) {
          s.add(f);
        }
        tokenCount += tokenFraction;
      }
    }
   
    if (tokenCount > 0 && lastExcerptPos < tokens.length)
      s.add(new Ellipsis());
    return s;
  }
View Full Code Here

    // TODO: check that phrases in the query are matched in the fragment

    Token[] tokens = getTokens(text);             // parse text to token array

    if (tokens.length == 0)
      return new Summary();

    String[] terms = query.getTerms();
    HashSet highlight = new HashSet();            // put query terms in table
    for (int i = 0; i < terms.length; i++)
      highlight.add(terms[i]);

    //
    // Create a SortedSet that ranks excerpts according to
    // how many query terms are present.  An excerpt is
    // a Vector full of Fragments and Highlights
    //
    // Rank excerpts by unique-query-token count, tie-broken by fragment count.
    // NOTE(review): two DISTINCT excerpts with equal counts compare as 0, so
    // TreeSet.add silently discards one of them — likely unintended; a List +
    // sort (as in the newer variant of this code) avoids the loss.
    // NOTE(review): the int-subtraction tie-break can overflow for extreme
    // values; Integer.compare would be safer.
    SortedSet excerptSet = new TreeSet(new Comparator() {
        public int compare(Object o1, Object o2) {
            Excerpt excerpt1 = (Excerpt) o1;
            Excerpt excerpt2 = (Excerpt) o2;

            if (excerpt1 == null && excerpt2 != null) {
                return -1;
            } else if (excerpt1 != null && excerpt2 == null) {
                return 1;
            } else if (excerpt1 == null && excerpt2 == null) {
                return 0;
            }

            int numToks1 = excerpt1.numUniqueTokens();
            int numToks2 = excerpt2.numUniqueTokens();

            if (numToks1 < numToks2) {
                return -1;
            } else if (numToks1 == numToks2) {
                return excerpt1.numFragments() - excerpt2.numFragments();
            } else {
                return 1;
            }
        }
    }
        );

    //
    // Iterate through all terms in the document
    //
    int lastExcerptPos = 0;
    for (int i = 0; i < tokens.length; i++) {
      //
      // If we find a term that's in the query...
      //
      if (highlight.contains(tokens[i].termText())) {
        //
        // Start searching at a point SUM_CONTEXT terms back,
        // and move SUM_CONTEXT terms into the future.
        //
        int startToken = (i > SUM_CONTEXT) ? i-SUM_CONTEXT : 0;
        int endToken = Math.min(i+SUM_CONTEXT, tokens.length);
        int offset = tokens[startToken].startOffset();
        int j = startToken;

        //
        // Iterate from the start point to the finish, adding
        // terms all the way.  The end of the passage is always
        // SUM_CONTEXT beyond the last query-term.
        //
        Excerpt excerpt = new Excerpt();
        if (i != 0) {
            excerpt.add(new Summary.Ellipsis());
        }

        //
        // Iterate through as long as we're before the end of
        // the document and we haven't hit the max-number-of-items
        // -in-a-summary.
        //
        while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
          //
          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
          if (highlight.contains(t.termText())) {
            excerpt.addToken(t.termText());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
            offset = t.endOffset();
            endToken = Math.min(j+SUM_CONTEXT, tokens.length);
          }

          j++;
        }

        lastExcerptPos = endToken;

        //
        // We found the series of search-term hits and added
        // them (with intervening text) to the excerpt.  Now
        // we need to add the trailing edge of text.
        //
        // So if (j < tokens.length) then there is still trailing
        // text to add.  (We haven't hit the end of the source doc.)
        // Add the words since the last hit-term insert.
        //
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
        }

        //
        // Remember how many terms are in this excerpt
        //
        excerpt.setNumTerms(j - startToken);

        //
        // Store the excerpt for later sorting
        //
        excerptSet.add(excerpt);

        //
        // Start SUM_CONTEXT places away.  The next
        // search for relevant excerpts begins at i-SUM_CONTEXT
        //
        i = j+SUM_CONTEXT;
      }
    }

    //
    // If the target text doesn't appear, then we just
    // excerpt the first SUM_LENGTH words from the document.
    //
    if (excerptSet.size() == 0) {
        Excerpt excerpt = new Excerpt();
        int excerptLen = Math.min(SUM_LENGTH, tokens.length);
        lastExcerptPos = excerptLen;

        // NOTE(review): startOffset() of the LAST token truncates the final
        // word from the excerpt; endOffset() looks intended — TODO confirm.
        excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
        excerpt.setNumTerms(excerptLen);
        excerptSet.add(excerpt);
    }

    //
    // Now choose the best items from the excerpt set.
    // Stop when our Summary grows too large.
    //
    double tokenCount = 0;
    Summary s = new Summary();
    while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
        Excerpt excerpt = (Excerpt) excerptSet.last();
        excerptSet.remove(excerpt);

        double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
        for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
            Fragment f = (Fragment) e.nextElement();
            // Don't add fragments if it takes us over the max-limit
            if (tokenCount + tokenFraction <= SUM_LENGTH) {
                s.add(f);
            }
            tokenCount += tokenFraction;
        }
    }
   
    if (tokenCount > 0 && lastExcerptPos < tokens.length)
      s.add(new Ellipsis());
    return s;
  }
View Full Code Here

      result.accumulate("indexno", hit.getIndexNo());
      result.accumulate("indexkey", hit.getUniqueKey());
     
      // Attach the hit's summary only when summaries were produced and the
      // request actually asked for them.
      if (summaries != null && results.isWithSummary()) {
        Summary summary = summaries[i];
        result.accumulate("summary", summary.toString());
      }
     
      // add the fields from hit details
      JSONObject fields = new JSONObject();
      for (int k = 0; k < detail.getLength(); k++) {
View Full Code Here

    for (int i=0; i<terms.length; i++) {
      weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
      // TODO : The max number of fragments (3) should be configurable
      String[] result = highlighter.getBestFragments(tokens, text, 3);
      for (int i=0; i<result.length; i++) {
        String[] parts = result[i].split(SEPARATOR);
        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }
    } catch (Exception e) {
      // Nothing to do...
    }
    return summary;
View Full Code Here

    // TODO: check that phrases in the query are matched in the fragment
   
    Token[] tokens = getTokens(text);             // parse text to token array
   
    if (tokens.length == 0)
      return new Summary();
   
    String[] terms = query.getTerms();
    HashSet highlight = new HashSet();            // put query terms in table
    for (int i = 0; i < terms.length; i++)
      highlight.add(terms[i]);
   
    // A list to store document's excerpts.
    // (An excerpt is a Vector full of Fragments and Highlights)
    List excerpts = new ArrayList();
   
    //
    // Iterate through all terms in the document
    //
    int lastExcerptPos = 0;
    for (int i = 0; i < tokens.length; i++) {
      //
      // If we find a term that's in the query...
      //
      if (highlight.contains(tokens[i].termText())) {
        //
        // Start searching at a point SUM_CONTEXT terms back,
        // and move SUM_CONTEXT terms into the future.
        //
        int startToken = (i > sumContext) ? i - sumContext : 0;
        int endToken = Math.min(i + sumContext, tokens.length);
        int offset = tokens[startToken].startOffset();
        int j = startToken;
       
        //
        // Iterate from the start point to the finish, adding
        // terms all the way.  The end of the passage is always
        // SUM_CONTEXT beyond the last query-term.
        //
        Excerpt excerpt = new Excerpt(i);
        if (i != 0) {
          excerpt.add(new Summary.Ellipsis());
        }
       
        //
        // Iterate through as long as we're before the end of
        // the document and we haven't hit the max-number-of-items
        // -in-a-summary.
        //
        while ((j < endToken) && (j - startToken < sumLength)) {
          //
          // Now grab the hit-element, if present
          //
          Token t = tokens[j];
          if (highlight.contains(t.termText())) {
            excerpt.addToken(t.termText());
            excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
            excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
            offset = t.endOffset();
            endToken = Math.min(j + sumContext, tokens.length);
          }
         
          j++;
        }
       
        lastExcerptPos = endToken;
       
        //
        // We found the series of search-term hits and added
        // them (with intervening text) to the excerpt.  Now
        // we need to add the trailing edge of text.
        //
        // So if (j < tokens.length) then there is still trailing
        // text to add.  (We haven't hit the end of the source doc.)
        // Add the words since the last hit-term insert.
        //
        if (j < tokens.length) {
          excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
        }
       
        //
        // Remember how many terms are in this excerpt
        //
        excerpt.setNumTerms(j - startToken);
       
        //
        // Store the excerpt for later sorting
        //
        excerpts.add(excerpt);
       
        //
        // Start SUM_CONTEXT places away.  The next
        // search for relevant excerpts begins at i-SUM_CONTEXT
        //
        i = j + sumContext;
      }
    }
   
    // Sort the excerpts based on their score
    Collections.sort(excerpts, SCORE_COMPARATOR);
   
    //
    // If the target text doesn't appear, then we just
    // excerpt the first SUM_LENGTH words from the document.
    //
    if (excerpts.size() == 0) {
      Excerpt excerpt = new Excerpt(0);
      int excerptLen = Math.min(sumLength, tokens.length);
      lastExcerptPos = excerptLen;
     
      excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
      excerpt.setNumTerms(excerptLen);
      excerpts.add(excerpt);
    }
   
    //
    // Now choose the best items from the excerpt set.
    // Stop when we have enought excerpts to build our Summary.
    //
    double tokenCount = 0;
    int numExcerpt = excerpts.size()-1;
    List bestExcerpts = new ArrayList();
    while (tokenCount <= sumLength && numExcerpt >= 0) {
      Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
      bestExcerpts.add(excerpt);
      tokenCount += excerpt.getNumTerms();
    }   
    // Sort the best excerpts based on their natural order
    Collections.sort(bestExcerpts, ORDER_COMPARATOR);
   
    //
    // Now build our Summary from the best the excerpts.
    //
    tokenCount = 0;
    numExcerpt = 0;
    Summary s = new Summary();
    while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
      Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
      double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
      for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
        Fragment f = (Fragment) e.nextElement();
        // Don't add fragments if it takes us over the max-limit
        if (tokenCount + tokenFraction <= sumLength) {
          s.add(f);
        }
        tokenCount += tokenFraction;
      }
    }
   
    if (tokenCount > 0 && lastExcerptPos < tokens.length)
      s.add(new Ellipsis());
    return s;
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.searcher.Summary$Highlight

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.