package proj.zoie.api.impl;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentInfos;
/**
 * @author ymatsuda
 */
// Merge policy that treats the first _numLargeSegments segments of the index as
// "large" segments and the remaining ones as "small" segments. findMerges keeps the
// large segments balanced in size and hands the small segments to the inherited
// LogByteSizeMergePolicy logic.
public class ZoieMergePolicy extends LogByteSizeMergePolicy
// Logger shared by this policy; used for the warnings emitted when a setter clamps its argument.
public static final Logger log = Logger.getLogger(ZoieMergePolicy.class.getName());
// Initial value of _numLargeSegments.
public static final int DEFAULT_NUM_LARGE_SEGMENTS = 6;
// Initial value of _maxSmallSegments.
public static final int DEFAULT_NUM_SMALL_SEGMENTS = 7;
// Merge factor installed by the constructor.
public static final int DEFAULT_MERGE_FACTOR = 6;
// When true, findBalancedMerges may add a single-segment merge for the un-merged
// segment holding the most deleted documents.
private boolean _partialExpunge = false;
// Number of leading segments that findMerges and findMergesToExpungeDeletes treat as large segments.
private int _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
// Upper bound on small segments; the setters keep it above the merge factor.
private int _maxSmallSegments = DEFAULT_NUM_SMALL_SEGMENTS; // default merge factor plus 1.
// Segment-count ceiling; findMerges collapses trailing small segments once the index exceeds it.
private int _maxSegments = _numLargeSegments + _maxSmallSegments;
/**
 * Creates a merge policy bound to the given index writer and installs
 * {@link #DEFAULT_MERGE_FACTOR} as the initial merge factor.
 *
 * @param writer the index writer this policy serves; handed to the superclass, which
 *               exposes it as the {@code writer} reference read by
 *               {@code findMergesForOptimize}
 */
public ZoieMergePolicy(IndexWriter writer) {
  // The superclass must receive the writer explicitly; without this call the
  // inherited writer reference is never initialised.
  super(writer);
  // Deliberately invoke the superclass setter: this class overrides setMergeFactor
  // and an overridable method should not run on a partially constructed instance.
  // The default is DEFAULT_MERGE_FACTOR (6); a small factor keeps the segment
  // count low, which the original author chose for search speed.
  super.setMergeFactor(DEFAULT_MERGE_FACTOR);
}
public void setMergePolicyParams(MergePolicyParams params){
if (params!=null){
protected long size(SegmentInfo info) throws IOException
long byteSize = info.sizeInBytes();
float delRatio = (info.docCount <= 0 ? 0.0f : ((float)info.getDelCount() / (float)info.docCount));
return (info.docCount <= 0 ? byteSize : (long)((float)byteSize * (1.0f - delRatio)));
public void setPartialExpunge(boolean doPartialExpunge)
_partialExpunge = doPartialExpunge;
public boolean getPartialExpunge()
return _partialExpunge;
public void setNumLargeSegments(int numLargeSegments)
if (numLargeSegments < 2)
log.warn("numLargeSegments cannot be less than 2, while " + numLargeSegments + " is requested. Override with 2.");
numLargeSegments = 2;
_numLargeSegments = numLargeSegments;
_maxSegments = _numLargeSegments + 2 * getMergeFactor();
public int getNumLargeSegments()
return _numLargeSegments;
public void setMaxSmallSegments(int maxSmallSegments)
if (maxSmallSegments < getMergeFactor()+1)
log.warn("MergeFactor is " +getMergeFactor() + ". maxSmallSegments is requested to be: "
+ maxSmallSegments + ". Override with mergeFactor + 1, since maxSmallSegments has to be greater than mergeFactor.");
maxSmallSegments = getMergeFactor() + 1;
_maxSmallSegments = maxSmallSegments;
_maxSegments = _numLargeSegments + _maxSmallSegments;
public int getMaxSmallSegments()
return _maxSmallSegments;
public void setMergeFactor(int mergeFactor)
if (mergeFactor<2)
log.warn("mergeFactor has to be at least 2. Override " + mergeFactor + " with 2");
mergeFactor = 2;
if(_maxSmallSegments < getMergeFactor())
log.warn("maxSmallSegments has to be greater than mergeFactor. Override maxSmallSegments to: " + (mergeFactor + 1));
_maxSmallSegments = getMergeFactor() + 1;
_maxSegments = _numLargeSegments + _maxSmallSegments;
private boolean isOptimized(SegmentInfos infos, IndexWriter writer, int maxNumSegments, Set<?> segmentsToOptimize) throws IOException {
final int numSegments = infos.size();
int numToOptimize = 0;
SegmentInfo optimizeInfo = null;
for(int i=0;i<numSegments && numToOptimize <= maxNumSegments;i++) {
final SegmentInfo info =;
if (segmentsToOptimize.contains(info)) {
optimizeInfo = info;
return numToOptimize <= maxNumSegments &&
(numToOptimize != 1 || isOptimized(writer, optimizeInfo));
/** Returns true if this single nfo is optimized (has no
* pending norms or deletes, is in the same dir as the
* writer, and matches the current compound file setting */
private boolean isOptimized(IndexWriter writer, SegmentInfo info)
throws IOException {
return !info.hasDeletions() &&
!info.hasSeparateNorms() &&
info.dir == writer.getDirectory() &&
info.getUseCompoundFile() == getUseCompoundFile();
/** Returns the merges necessary to optimize the index.
* This merge policy defines "optimized" to mean only one
* segment in the index, where that segment has no
* deletions pending nor separate norms, and it is in
* compound file format if the current useCompoundFile
* setting is true. This method returns multiple merges
* (mergeFactor at a time) so the {@link MergeScheduler}
* in use may make use of concurrency. */
public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxNumSegments, Set segmentsToOptimize) throws IOException {
assert maxNumSegments > 0;
MergeSpecification spec = null;
if (!isOptimized(infos, writer, maxNumSegments, segmentsToOptimize))
// Find the newest (rightmost) segment that needs to
// be optimized (other segments may have been flushed
// since optimize started):
int last = infos.size();
while(last > 0)
final SegmentInfo info =;
if (segmentsToOptimize.contains(info))
if (last > 0)
if (maxNumSegments == 1)
// Since we must optimize down to 1 segment, the
// choice is simple:
boolean useCompoundFile = getUseCompoundFile();
if (last > 1 || !isOptimized(writer,
spec = new MergeSpecification();
spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
else if (last > maxNumSegments)
// find most balanced merges
spec = findBalancedMerges(infos, last, maxNumSegments, _partialExpunge);
return spec;
private MergeSpecification findBalancedMerges(SegmentInfos infos, int infoLen, int maxNumSegments, boolean partialExpunge)
throws IOException
if (infoLen <= maxNumSegments) return null;
MergeSpecification spec = new MergeSpecification();
boolean useCompoundFile = getUseCompoundFile();
// use Viterbi algorithm to find the best segmentation.
// we will try to minimize the size variance of resulting segments.
double[][] variance = createVarianceTable(infos, infoLen, maxNumSegments);
final int maxMergeSegments = infoLen - maxNumSegments + 1;
double[] sumVariance = new double[maxMergeSegments];
int[][] backLink = new int[maxNumSegments][maxMergeSegments];
for(int i = (maxMergeSegments - 1); i >= 0; i--)
sumVariance[i] = variance[0][i];
backLink[0][i] = 0;
for(int i = 1; i < maxNumSegments; i++)
for(int j = (maxMergeSegments - 1); j >= 0; j--)
double minV = Double.MAX_VALUE;
int minK = 0;
for(int k = j; k >= 0; k--)
double v = sumVariance[k] + variance[i + k][j - k];
if(v < minV)
minV = v;
minK = k;
sumVariance[j] = minV;
backLink[i][j] = minK;
// now, trace back the back links to find all merges,
// also find a candidate for partial expunge if requested
int mergeEnd = infoLen;
int prev = maxMergeSegments - 1;
int expungeCandidate = -1;
int maxDelCount = 0;
for(int i = maxNumSegments - 1; i >= 0; i--)
prev = backLink[i][prev];
int mergeStart = i + prev;
if((mergeEnd - mergeStart) > 1)
spec.add(new OneMerge(infos.range(mergeStart, mergeEnd), useCompoundFile));
SegmentInfo info =;
int delCount = info.getDelCount();
if(delCount > maxDelCount)
expungeCandidate = mergeStart;
maxDelCount = delCount;
mergeEnd = mergeStart;
if(partialExpunge && maxDelCount > 0)
// expunge deletes
spec.add(new OneMerge(infos.range(expungeCandidate, expungeCandidate + 1), useCompoundFile));
return spec;
private double[][] createVarianceTable(SegmentInfos infos, int last, int maxNumSegments) throws IOException
int maxMergeSegments = last - maxNumSegments + 1;
double[][] variance = new double[last][maxMergeSegments];
// compute the optimal segment size
long optSize = 0;
long[] sizeArr = new long[last];
for(int i = 0; i < sizeArr.length; i++)
sizeArr[i] = size(;
optSize += sizeArr[i];
optSize = (optSize / maxNumSegments);
for(int i = 0; i < last; i++)
long size = 0;
for(int j = 0; j < maxMergeSegments; j++)
if((i + j) < last)
size += sizeArr[i + j];
double residual = ((double)size/(double)optSize) - 1.0d;
variance[i][j] = residual * residual;
variance[i][j] = Double.NaN;
return variance;
* Finds merges necessary to expunge all deletes from the
* index. The number of large segments will stay the same.
public MergeSpecification findMergesToExpungeDeletes(SegmentInfos infos)
throws CorruptIndexException, IOException
final int numSegs = infos.size();
final int numLargeSegs = (numSegs < _numLargeSegments ? numSegs : _numLargeSegments);
MergeSpecification spec = null;
if(numLargeSegs < numSegs)
SegmentInfos smallSegments = infos.range(numLargeSegs, numSegs);
spec = super.findMergesToExpungeDeletes(smallSegments);
if(spec == null) spec = new MergeSpecification();
for(int i = 0; i < numLargeSegs; i++)
SegmentInfo info =;
spec.add(new OneMerge(infos.range(i, i + 1), getUseCompoundFile()));
return spec;
/** Checks if any merges are now necessary and returns a
* {@link MergePolicy.MergeSpecification} if so.
* This merge policy try to maintain {@link
* #setNumLargeSegments} of large segments in similar sizes.
* {@link LogByteSizeMergePolicy} to small segments.
* Small segments are merged and promoted to a large segment
* when the total size reaches the average size of large segments.
public MergeSpecification findMerges(SegmentInfos infos) throws IOException
final int numSegs = infos.size();
final int numLargeSegs = _numLargeSegments;
if(numSegs <= numLargeSegs) return null;
long totalLargeSegSize = 0;
long totalSmallSegSize = 0;
SegmentInfo info;
// compute the total size of large segments
for(int i = 0; i < numLargeSegs; i++)
info =;
totalLargeSegSize += size(info);
// compute the total size of small segments
for(int i = numLargeSegs; i < numSegs; i++)
info =;
totalSmallSegSize += size(info);
long targetSegSize = (totalLargeSegSize / (numLargeSegs - 1));
if(targetSegSize <= totalSmallSegSize)
// the total size of small segments is big enough,
// promote the small segments to a large segment and do balanced merge,
if(totalSmallSegSize < targetSegSize * 2)
MergeSpecification spec = findBalancedMerges(infos, numLargeSegs, (numLargeSegs - 1), _partialExpunge);
if(spec == null) spec = new MergeSpecification(); // should not happen
spec.add(new OneMerge(infos.range(numLargeSegs, numSegs), getUseCompoundFile()));
return spec;
return findBalancedMerges(infos, numSegs, numLargeSegs, _partialExpunge);
else if(_maxSegments < numSegs)
// we have more than _maxSegments, merge small segments smaller than targetSegSize/4
MergeSpecification spec = new MergeSpecification();
int startSeg = numLargeSegs;
long sizeThreshold = (targetSegSize / 4);
while(startSeg < numSegs)
info =;
if(size(info) < sizeThreshold) break;
spec.add(new OneMerge(infos.range(startSeg, numSegs), getUseCompoundFile()));
return spec;
// apply the log merge policy to small segments.
SegmentInfos smallSegments = infos.range(numLargeSegs, numSegs);
MergeSpecification spec = super.findMerges(smallSegments);
OneMerge expunge = findOneSegmentToExpunge(infos, numLargeSegs);
if(expunge != null)
if(spec == null) spec = new MergeSpecification();
return spec;
private OneMerge findOneSegmentToExpunge(SegmentInfos infos, int maxNumSegments) throws IOException
int expungeCandidate = -1;
int maxDelCount = 0;
for(int i = maxNumSegments - 1; i >= 0; i--)
SegmentInfo info =;
int delCount = info.getDelCount();
if(delCount > maxDelCount)
expungeCandidate = i;
maxDelCount = delCount;
if(maxDelCount > 0)
return new OneMerge(infos.range(expungeCandidate, expungeCandidate + 1), getUseCompoundFile());
return null;
public static class MergePolicyParams
public static final Logger log = Logger.getLogger(ZoieMergePolicy.MergePolicyParams.class.getName());
private int _numLargeSegments;
private int _maxSmallSegments;
private boolean _doPartialExpunge;
private int _mergeFactor;
private boolean _useCompoundFile;
private int _maxMergeDocs;
public MergePolicyParams()
_useCompoundFile = false;
_doPartialExpunge = false;
_maxMergeDocs = LogMergePolicy.DEFAULT_MAX_MERGE_DOCS;
public String toString()
StringBuffer sb = new StringBuffer();
sb.append("useCompoundFile: ").append(_useCompoundFile);
sb.append(", doPartialExpunge: ").append(_doPartialExpunge);
sb.append(", numLargeSegments: ").append(_numLargeSegments);
sb.append(", maxSmallSegments: ").append(_maxSmallSegments);
sb.append(", mergeFactor: ").append(_mergeFactor);
sb.append(", maxMergeDocs: ").append(_maxMergeDocs);
return sb.toString();
public synchronized void setNumLargeSegments(int numLargeSegments)
if (numLargeSegments < 2)
log.warn("numLargeSegments cannot be less than 2, while " + numLargeSegments + " is requested. Override with 2.");
numLargeSegments = 2;
_numLargeSegments = numLargeSegments;;
public synchronized int getNumLargeSegments()
return _numLargeSegments;
public synchronized void setMaxSmallSegments(int maxSmallSegments)
if (maxSmallSegments < getMergeFactor()+1)
log.warn("MergeFactor is " +getMergeFactor() + ". maxSmallSegments is requested to be: "
+ maxSmallSegments + ". Override with mergeFactor + 1, since maxSmallSegments has to be greater than mergeFactor.");
maxSmallSegments = getMergeFactor() + 1;
_maxSmallSegments = maxSmallSegments;;
public synchronized int getMaxSmallSegments()
return _maxSmallSegments;
public synchronized void setPartialExpunge(boolean doPartialExpunge)
_doPartialExpunge = doPartialExpunge;;
public synchronized boolean getPartialExpunge()
return _doPartialExpunge;
public synchronized void setMergeFactor(int mergeFactor)
if (mergeFactor<2)
log.warn("mergeFactor has to be at least 2. Override " + mergeFactor + " with 2");
mergeFactor = 2;
_mergeFactor = mergeFactor;
if(_maxSmallSegments < getMergeFactor())
log.warn("maxSmallSegments has to be greater than mergeFactor. Override maxSmallSegments to: " + (mergeFactor + 1));
_maxSmallSegments = getMergeFactor() + 1;
public synchronized int getMergeFactor()
return _mergeFactor;
public synchronized void setMaxMergeDocs(int maxMergeDocs)
_maxMergeDocs = maxMergeDocs;;
public synchronized int getMaxMergeDocs()
return _maxMergeDocs;
public synchronized void setUseCompoundFile(boolean useCompoundFile)
_useCompoundFile = useCompoundFile;;
public synchronized boolean isUseCompoundFile()
return _useCompoundFile;