/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.core.query.lucene;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.util.PriorityQueue;
import java.util.Set;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import java.util.IdentityHashMap;
import java.util.Map;
import java.util.LinkedList;
import java.io.IOException;
/**
* <code>WeightedHighlighter</code> implements a highlighter that weights the
* fragments based on the proximity of the highlighted terms to each other. The
* returned fragments are not necessarily in sequence as the text occurs in the
* content.
*/
public class WeightedHighlighter extends DefaultHighlighter {
/**
* Punctuation characters that mark the end of a sentence.
*/
private static final BitSet PUNCTUATION = new BitSet();
static {
PUNCTUATION.set('.');
PUNCTUATION.set('!');
PUNCTUATION.set(0xa1); // inverted exclamation mark
PUNCTUATION.set('?');
PUNCTUATION.set(0xbf); // inverted question mark
// todo add more
}
protected WeightedHighlighter() {
}
/**
* @param tvec the term position vector for this hit
* @param queryTerms the query terms.
* @param text the original text that was used to create the
* tokens.
* @param excerptStart this string is prepended to the excerpt
* @param excerptEnd this string is appended to the excerpt
* @param fragmentStart this string is prepended to every fragment
* @param fragmentEnd this string is appended to the end of every
* fragement.
* @param hlStart the string used to prepend a highlighted token, for
* example <tt>"<b>"</tt>
* @param hlEnd the string used to append a highlighted token, for
* example <tt>"</b>"</tt>
* @param maxFragments the maximum number of fragments
* @param surround the maximum number of chars surrounding a
* highlighted token
* @return a String with text fragments where tokens from the query are
* highlighted
*/
public static String highlight(TermPositionVector tvec,
Set queryTerms,
String text,
String excerptStart,
String excerptEnd,
String fragmentStart,
String fragmentEnd,
String hlStart,
String hlEnd,
int maxFragments,
int surround) throws IOException {
return new WeightedHighlighter().doHighlight(tvec, queryTerms, text,
excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart,
hlEnd, maxFragments, surround);
}
/**
* @param tvec the term position vector for this hit
* @param queryTerms the query terms.
* @param text the original text that was used to create the tokens.
* @param maxFragments the maximum number of fragments
* @param surround the maximum number of chars surrounding a highlighted
* token
* @return a String with text fragments where tokens from the query are
* highlighted
*/
public static String highlight(TermPositionVector tvec,
Set queryTerms,
String text,
int maxFragments,
int surround) throws IOException {
return highlight(tvec, queryTerms, text, START_EXCERPT, END_EXCERPT,
START_FRAGMENT_SEPARATOR, END_FRAGMENT_SEPARATOR,
START_HIGHLIGHT, END_HIGHLIGHT, maxFragments, surround);
}
protected String mergeFragments(TermVectorOffsetInfo[] offsets,
String text,
String excerptStart,
String excerptEnd,
String fragmentStart,
String fragmentEnd,
String hlStart,
String hlEnd,
int maxFragments,
int surround) throws IOException {
if (offsets == null || offsets.length == 0) {
// nothing to highlight
return createDefaultExcerpt(text, excerptStart, excerptEnd,
fragmentStart, fragmentEnd, surround * 2);
}
PriorityQueue bestFragments = new FragmentInfoPriorityQueue(maxFragments);
for (int i = 0; i < offsets.length; i++) {
if (offsets[i].getEndOffset() <= text.length()) {
FragmentInfo fi = new FragmentInfo(offsets[i], surround * 2);
for (int j = i + 1; j < offsets.length; j++) {
if (offsets[j].getEndOffset() > text.length()) {
break;
}
if (!fi.add(offsets[j], text)) {
break;
}
}
bestFragments.insert(fi);
}
}
if (bestFragments.size() == 0) {
return createDefaultExcerpt(text, excerptStart, excerptEnd,
fragmentStart, fragmentEnd, surround * 2);
}
// retrieve fragment infos from queue and fill into list, least
// fragment comes out first
List infos = new LinkedList();
while (bestFragments.size() > 0) {
FragmentInfo fi = (FragmentInfo) bestFragments.pop();
infos.add(0, fi);
}
Map offsetInfos = new IdentityHashMap();
// remove overlapping fragment infos
Iterator it = infos.iterator();
while (it.hasNext()) {
FragmentInfo fi = (FragmentInfo) it.next();
boolean overlap = false;
Iterator fit = fi.iterator();
while (fit.hasNext() && !overlap) {
TermVectorOffsetInfo oi = (TermVectorOffsetInfo) fit.next();
if (offsetInfos.containsKey(oi)) {
overlap = true;
}
}
if (overlap) {
it.remove();
} else {
Iterator oit = fi.iterator();
while (oit.hasNext()) {
offsetInfos.put(oit.next(), null);
}
}
}
// create excerpts
StringBuffer sb = new StringBuffer(excerptStart);
it = infos.iterator();
while (it.hasNext()) {
FragmentInfo fi = (FragmentInfo) it.next();
sb.append(fragmentStart);
int limit = Math.max(0, fi.getStartOffset() / 2 + fi.getEndOffset() / 2 - surround);
int len = startFragment(sb, text, fi.getStartOffset(), limit);
TermVectorOffsetInfo lastOffsetInfo = null;
Iterator fIt = fi.iterator();
while (fIt.hasNext()) {
TermVectorOffsetInfo oi = (TermVectorOffsetInfo) fIt.next();
if (lastOffsetInfo != null) {
// fill in text between terms
sb.append(text.substring(lastOffsetInfo.getEndOffset(), oi.getStartOffset()));
}
sb.append(hlStart);
sb.append(text.substring(oi.getStartOffset(), oi.getEndOffset()));
sb.append(hlEnd);
lastOffsetInfo = oi;
}
limit = Math.min(text.length(), fi.getStartOffset() - len + (surround * 2));
endFragment(sb, text, fi.getEndOffset(), limit);
sb.append(fragmentEnd);
}
sb.append(excerptEnd);
return sb.toString();
}
/**
* Writes the start of a fragment to the string buffer <code>sb</code>. The
* first occurrence of a matching term is indicated by the
* <code>offset</code> into the <code>text</code>.
*
* @param sb where to append the start of the fragment.
* @param text the original text.
* @param offset the start offset of the first matching term in the
* fragment.
* @param limit do not go back further than <code>limit</code>.
* @return the length of the start fragment that was appended to
* <code>sb</code>.
*/
private static int startFragment(StringBuffer sb, String text, int offset, int limit) {
if (limit == 0) {
// append all
sb.append(text.substring(0, offset));
return offset;
}
String intro = "... ";
int start = offset;
for (int i = offset - 1; i >= limit; i--) {
if (Character.isWhitespace(text.charAt(i))) {
// potential start
start = i + 1;
if (i - 1 >= limit && PUNCTUATION.get(text.charAt(i - 1))) {
// start of sentence found
intro = "";
break;
}
}
}
sb.append(intro).append(text.substring(start, offset));
return offset - start;
}
/**
* Writes the end of a fragment to the string buffer <code>sb</code>. The
* last occurrence of a matching term is indicated by the
* <code>offset</code> into the <code>text</code>.
*
* @param sb where to append the start of the fragment.
* @param text the original text.
* @param offset the end offset of the last matching term in the fragment.
* @param limit do not go further than <code>limit</code>.
*/
private static void endFragment(StringBuffer sb, String text, int offset, int limit) {
if (limit == text.length()) {
// append all
sb.append(text.substring(offset));
return;
}
int end = offset;
for (int i = end; i < limit; i++) {
if (Character.isWhitespace(text.charAt(i))) {
// potential end
end = i;
}
}
sb.append(text.substring(offset, end)).append(" ...");
}
private static class FragmentInfo {
ArrayList offsetInfosList;
int startOffset;
int endOffset;
int maxFragmentSize;
int quality;
public FragmentInfo(TermVectorOffsetInfo offsetinfo, int maxFragmentSize) {
offsetInfosList = new ArrayList();
offsetInfosList.add(offsetinfo);
startOffset = offsetinfo.getStartOffset();
endOffset = offsetinfo.getEndOffset();
this.maxFragmentSize = maxFragmentSize;
quality = 0;
}
public boolean add(TermVectorOffsetInfo offsetinfo, String text) {
if (offsetinfo.getEndOffset() > (startOffset + maxFragmentSize)) {
return false;
}
offsetInfosList.add(offsetinfo);
if (offsetinfo.getStartOffset() - endOffset <= 3) {
// boost quality when terms are adjacent
// and only separated by whitespace character
boolean boost = true;
for (int i = endOffset; i < offsetinfo.getStartOffset(); i++) {
if (!Character.isWhitespace(text.charAt(i))) {
boost = false;
break;
}
}
if (boost) {
quality += 10;
} else {
quality++;
}
} else {
quality++;
}
endOffset = offsetinfo.getEndOffset();
return true;
}
public Iterator iterator() {
return offsetInfosList.iterator();
}
public int getStartOffset() {
return startOffset;
}
public int getEndOffset() {
return endOffset;
}
public int getQuality() {
return quality;
}
}
private static class FragmentInfoPriorityQueue extends PriorityQueue {
public FragmentInfoPriorityQueue(int size) {
initialize(size);
}
/**
* Checks the quality of two {@link FragmentInfo} objects. The one with
* the lower quality is considered less than the other. If both
* fragments have the same quality, the one with the higher start offset
* is considered the lesser. This will result in a queue that keeps the
* {@link FragmentInfo} with the best quality.
*/
protected boolean lessThan(Object a, Object b) {
FragmentInfo infoA = (FragmentInfo) a;
FragmentInfo infoB = (FragmentInfo) b;
if (infoA.getQuality() == infoB.getQuality()) {
return infoA.getStartOffset() > infoB.getStartOffset();
}
return infoA.getQuality() < infoB.getQuality();
}
}
}