package dovetaildb.bagindex;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import dovetaildb.bytes.AbstractBytes;
import dovetaildb.bytes.ArrayBytes;
import dovetaildb.bytes.Bytes;
import dovetaildb.bytes.CompoundBytes;
import dovetaildb.bytes.MaterializedBytes;
import dovetaildb.bytes.SlicedBytes;
import dovetaildb.querynode.QueryNode;
import dovetaildb.querynode.RangeQueryNode;
import dovetaildb.store.BytesInterface;
import dovetaildb.store.ChunkedMemoryMappedFile;
import dovetaildb.store.VarPosition;
import dovetaildb.util.PubliclyCloneable;
import dovetaildb.util.Util;
public class ImmutableBagIndex extends BagIndex {
/*
 * Term tree alternative:
 *
 * A term table is an ordered list of Terms.
 * A Term is:
 *   1 byte of term content
 *   a pointer to a sub term table
 *   a "Segment Push" (pointer to doc id list + count)
 *
 * A Doc is:
 *   a doc id delta
 *   a partial term
 *     (not including the term tree already traversed and
 *      not including any suffix covered by subterms)
 * A Segment Push is:
 *   a doc id list offset pointer
 *   a document count
 */
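/*
 * Illustrative example (a layout sketch only, not the on-disk encoding):
 * indexing the terms "car", "cat", and "dog" under the scheme above could
 * produce a term tree shaped like:
 *
 *   'c' -> sub table: 'a' -> sub table: 'r' -> segment push (doc id list, ct)
 *                                       't' -> segment push (doc id list, ct)
 *   'd' -> sub table: 'o' -> sub table: 'g' -> segment push (doc id list, ct)
 *
 * Each segment push records a pointer to its doc id list plus a document count.
 */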
/**
PART 1 page:
VInt header: the bottom bit indicates whether this is a segment push plus a part change
or a part change only; the second-to-bottom bit indicates whether the part change
applies only to terms terminating at this value. The remaining bits give the number
of consumed value bytes.
<value>
part change: backwards offset and initial docId
OPTIONAL: segment push: backwards offset and term cap (num bytes and value)
PART 2 page:
VInt docid increment; the bottom three bits indicate whether the value is inline
(and its length), or act as two flags: whether the value is external and whether
this is a segment push.
If it is a segment push, the docid increment is applied prior to the push and a VInt
follows for the backwards offset; bottom bit = 1 indicates an increment-cap VInt
immediately follows.
The increment cap for the push is inclusive, so a value of 0 means pull only one entry.
The next docid is an increment in addition to the cap if present; otherwise it is an
increment added to the first VInt in this record.
If it has an external value, two VInts follow, for backwards offset and length.
If it has an inline value, the value bytes follow inline.
*/
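// Minimal decoding sketch for a record header VInt. This mirrors the two flag
// bits used by readGenericRec in the commented-out draft at the end of this
// file (byte value and segment push); the richer three-bit inline-value
// variant described above is not modeled here. The method name is invented
// for illustration and is not called by the index.
static void decodeHeaderVIntSketch(long vint) {
boolean hasByteValue = (vint & 0x02L) != 0; // prefix pointer (+ optional suffix) follows
boolean hasSegmentPush = (vint & 0x01L) != 0; // backwards offset and cap follow
long value = vint >> 2; // e.g. a docid increment or a consumed-byte count
System.out.println("byteValue=" + hasByteValue + " segmentPush=" + hasSegmentPush + " value=" + value);
}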
protected BytesInterface data;
protected BytesInterface header;
protected long maxDocId;
protected String homeDir;
protected long version;
private long rootPos;
// NEW DESIGN
static class DocTerm {
long docId;
Bytes term;
}
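/** A single insertion or deletion of a (docId, term) posting; ordered by doc id, then term. */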
static class Edit extends DocTerm implements Comparable<DocTerm> {
boolean isDelete;
public Edit(long docId, Bytes term, boolean isDelete) {
this.docId = docId;
this.term = term;
this.isDelete = isDelete;
}
public int compareTo(DocTerm o) {
// compare ids directly rather than by subtraction, which could overflow
if (docId != o.docId) return (docId > o.docId) ? 1 : -1;
return term.compareTo(o.term);
}
public String toString() {
return "Edit("+docId+","+term+","+(isDelete?"del":"ins")+")";
}
}
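/**
 * A node in a skip-list-like chain of records: next() walks the current level,
 * down() descends to a denser level, and cumulativeCount() reports how many
 * entries this node spans at its level.
 */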
static interface Rec extends Cloneable {
public Rec next();
public Rec down();
public Rec clone();
public long compareTo(DocTerm docTerm);
public long cumulativeCount();
public Bytes getPrefix();
public Bytes getSuffix();
}
static interface DocRec extends Rec {
public long getDocId();
public DocRec next();
public DocRec down();
public DocRec clone();
public long getDownCt();
}
static interface TermRec extends Rec {
public DocRec getDocList();
public long getDocListCap();
public TermRec next();
public TermRec down();
public TermRec clone();
}
/*
private class MaterializedDocRec implements DocRec {
VarPosition pos;
long docId;
long segPos, segCt;
MaterializedBytesPair prefix;
MaterializedBytes suffix;
public MaterializedDocRec(long position) {
pos = new VarPosition(position);
read();
}
private void read() {
// do it
}
public long compareTo(DocTerm docTerm) {
long delta = docId - docTerm.docId;
if (delta != 0) return delta;
return new CompoundBytes(prefix,suffix).compareTo(docTerm.term);
}
public long cumulativeCount() { return segCt+1; }
public Bytes getPrefix() { return prefix; }
public Bytes getSuffix() { return suffix; }
public long getDocId() { return docId; }
public DocRec next() {
read();
return this;
}
public DocRec down() {
return new MaterializedDocRec(segPos);
}
}
*/
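/**
 * A counted view over a DocRec chain: yields at most {@code ct} entries
 * starting at {@code dr}, descending to lower levels when the bound ends
 * partway through a node's span.
 */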
static final class BoundedDocRec implements DocRec {
long ct;
DocRec dr;
public BoundedDocRec(DocRec dr, long ct) {
this.dr=dr;
this.ct=ct;
}
public boolean stopsHere() { return dr.cumulativeCount() >= ct; }
public MemDocRec getMemDocRec() {
if (! stopsHere()) {
return new MemDocRec(dr);
} else {
for(BoundedDocRec sub=down(); sub!=null; sub=sub.next()) {
if (sub.stopsHere()) {
MemDocRec r = sub.getMemDocRec();
r.setDown(dr.down());
r.setDownCt(ct-1);
return r;
}
}
throw new RuntimeException("NOT HANDLED");
}
}
public BoundedDocRec clone() {
return new BoundedDocRec(dr.clone(), ct);
}
public long getDocId() { return dr.getDocId(); }
public long getDownCt() {
long downct = dr.getDownCt();
return (ct > downct) ? downct : ct;
}
public long compareTo(DocTerm docTerm) { return dr.compareTo(docTerm); }
public long cumulativeCount() { return dr.cumulativeCount(); }
public Bytes getPrefix() { return dr.getPrefix(); }
public Bytes getSuffix() { return dr.getSuffix(); }
public BoundedDocRec down() {
long downct = getDownCt();
DocRec down = dr.down();
if (down == null || downct < 1) return null;
return new BoundedDocRec(down, downct);
}
public String toString() { return "BoundedDocRec(ct=" + ct + ", " + dr + ")"; }
public BoundedDocRec next() {
ct -= dr.cumulativeCount();
if (ct <= 0) return null;
dr = dr.next();
if (dr == null) return null;
while(ct < dr.cumulativeCount()) {
// remaining bound ends inside this node's span (the count does not include this node); descend a level
dr = dr.down();
}
return this;
}
}
static abstract class MemRec implements Rec {
Bytes prefix,suffix;
public abstract void setNext(Rec r);
public abstract void setDown(Rec r);
public abstract void setDownCt(long ct);
public Bytes getPrefix() { return prefix; }
public Bytes getSuffix() { return suffix; }
public MemRec clone() {
try {
return (MemRec)super.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
}
static final class MemDocRec extends MemRec implements DocRec {
DocRec next, down;
long docId, downCt;
public MemDocRec(Bytes prefix, Bytes suffix, long docId, DocRec next) {
this(prefix, suffix, docId, next, null, 0);
}
public MemDocRec(Bytes prefix, Bytes suffix, long docId, DocRec next, DocRec down, long downCt) {
this.prefix = prefix;
this.suffix = suffix;
this.docId = docId;
this.next = next;
this.down = down;
this.downCt = downCt;
}
public MemDocRec(DocRec cur) {
// advance a clone so that computing next() does not mutate cur
DocRec temp = cur.clone();
this.next = temp.next();
this.down = cur.down();
this.docId = cur.getDocId();
this.downCt = cur.getDownCt();
this.prefix = cur.getPrefix();
this.suffix = cur.getSuffix();
}
public void setNext(Rec r) {this.next = (DocRec)r; }
public long getDocId() { return this.docId; }
public long cumulativeCount() { return downCt + 1; }
public DocRec down() { return down; }
public DocRec next() { return next; }
public long compareTo(DocTerm docTerm) {
long delta = docId - docTerm.docId;
if (delta != 0) return delta;
return new CompoundBytes(prefix,suffix).compareTo(docTerm.term);
}
public void setDown(Rec r) { this.down = (DocRec)r; }
public long getDownCt() { return this.downCt; }
public void setDownCt(long ct) { this.downCt = ct; }
public String toString() {
String s = System.identityHashCode(this)+":"+prefix+"@"+System.identityHashCode(prefix)+" "+suffix+" doc"+this.getDocId()+" ";
if (down!=null) {
s += "dwn("+this.downCt+")->"+System.identityHashCode(down) + " ";
}
s += "nxt->"+System.identityHashCode(next);
return s;
}
public MemDocRec clone() { return (MemDocRec)super.clone(); }
}
static final class MemTermRec extends MemRec implements TermRec {
TermRec next, down;
DocRec docs;
long docCt, downCt;
public MemTermRec(Bytes prefix, Bytes suffix, TermRec next,
DocRec docs, long docCt) {
this(prefix, suffix, next, docs, docCt, null, 0);
}
public MemTermRec(Bytes prefix, Bytes suffix, TermRec next,
DocRec docs, long docCt,
TermRec down, long downCt) {
this.prefix = prefix;
this.suffix = suffix;
this.next = next;
this.docs = docs;
this.docCt = docCt;
this.down = down;
this.downCt = downCt;
}
public void setNext(Rec r) {this.next = (TermRec)r; }
public long cumulativeCount() { return downCt + 1; }
public TermRec down() { return down; }
public TermRec next() { return next; }
public long compareTo(DocTerm docTerm) {
return new CompoundBytes(prefix,suffix).compareTo(docTerm.term);
}
public DocRec getDocList() { return docs; }
public long getDocListCap() { return docCt; }
public void setDown(Rec r) { this.down = (TermRec)r; }
public void setDownCt(long ct) { this.downCt = ct; }
public MemTermRec clone() { return (MemTermRec)super.clone(); }
}
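/** Accumulates MemRecs into a singly linked chain while tracking their cumulative count. */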
static class RecBuffer<T extends MemRec> {
T head = null;
T tail = null;
long ct = 0;
public void append(T seg) {
ct += seg.cumulativeCount();
seg.setNext(null);
if (tail == null) {
head = tail = seg;
} else {
tail.setNext(seg);
tail = seg;
}
}
public long cumulativeCount() { return ct; }
public void extend(RecBuffer<T> buffer) {
T otherHead = buffer.head;
if (otherHead == null) return;
if (tail == null) {
head = buffer.head;
tail = buffer.tail;
} else {
tail.setNext(buffer.head);
tail = buffer.tail;
}
ct += buffer.ct;
}
public void clear() {
this.ct=0;
this.head=this.tail=null;
}
public String toString() {
StringBuffer b = new StringBuffer("RecBuffer\n");
T h = head;
LinkedList<T> todo = new LinkedList<T>();
HashSet<T> seen = new HashSet<T>();
todo.add(head);
while(! todo.isEmpty()) {
h = todo.removeFirst();
b.append("{\n");
while(h != null) {
if (seen.contains(h)) break;
seen.add(h);
b.append(h.toString());
b.append("\n");
Rec down = h.down();
if (down != null && !seen.contains(down))
todo.addLast((T)down);
h = (T)h.next();
}
b.append("}\n");
}
return b.toString();
}
}
// public static RecBuffer<MemTermRec> applyTermEdits(List<Edit> edits, TermRec head) {
// //TODO
// }
public static String docListToString(BoundedDocRec cur, String indentPrefix) {
StringBuffer b = new StringBuffer();
while(cur != null) {
BoundedDocRec down = cur.down();
if (down != null) {
b.append(docListToString(down, indentPrefix+" "));
}
b.append(indentPrefix + cur.dr);
b.append('\n');
cur = cur.next();
}
return b.toString();
}
public static RecBuffer<MemDocRec> applyDocEdits(List<Edit> edits, DocRec head, long ct) {
BoundedDocRec bhead = (head==null) ? null : new BoundedDocRec(head, ct);
RecBuffer<MemDocRec> results = new RecBuffer<MemDocRec>();
applyDocEdits(edits, bhead, results);
return results;
}
public static void applyDocEdits(List<Edit> edits, BoundedDocRec head, RecBuffer<MemDocRec> result) {
// prereq: edits are in doc id, then term order
int editLength = edits.size();
Bytes prefix = (head == null) ? ArrayBytes.EMPTY_BYTES : head.getPrefix();
int prefixLen = prefix.getLength();
BoundedDocRec cur = head;
int editIdx = 0;
Edit edit = null;
if (!edits.isEmpty()) edit = edits.get(0);
while(edit != null && cur != null) {
long cmp = cur.compareTo(edit);
if (cmp < 0) {
result.append(new MemDocRec(cur.dr));
cur = cur.next();
} else {
if (cur.down() == null) {// leaf node
if (edit.isDelete) {
if (cmp != 0) throw new RuntimeException("Unexpected");
cur = cur.next();
} else {
MemDocRec newRec = new MemDocRec(prefix,
new SlicedBytes(edit.term,prefixLen),
edit.docId, null, null, 0);
result.append(newRec);
}
editIdx++;
} else { // means it's a segment push
BoundedDocRec subSegment = cur.down();
int startIdx = editIdx;
for(;editIdx<editLength; editIdx++) {
edit = edits.get(editIdx);
cmp = cur.compareTo(edit);
if (cmp <= 0) break;
}
applyDocEdits(edits.subList(startIdx, editIdx), subSegment, result);
if (edit.isDelete && cmp == 0) {
// do not add tail
editIdx++;
} else {
MemDocRec tail = new MemDocRec(cur.dr);
tail.down = null;
tail.downCt = 0;
result.append(tail);
}
cur = cur.next();
}
edit = (editIdx < edits.size()) ? edits.get(editIdx) : null;
}
}
while (cur != null) {
result.append(new MemDocRec(cur.dr));
cur = cur.next();
}
while (editIdx < edits.size()) {
edit = edits.get(editIdx++);
MemDocRec newRec = new MemDocRec(prefix,
new SlicedBytes(edit.term,prefixLen),
edit.docId, null, null, 0);
result.append(newRec);
}
}
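// Illustrative usage sketch (not called by the index): merge two insertions
// into an empty doc list, then rebalance the result. The terms and the
// leaf-size threshold of 16 are invented for the example; Edit, applyDocEdits,
// and balance are the methods defined in this file.
static RecBuffer<MemDocRec> exampleApplyAndBalance() {
List<Edit> edits = new ArrayList<Edit>();
edits.add(new Edit(1L, new ArrayBytes("color=red".getBytes()), false));
edits.add(new Edit(2L, new ArrayBytes("color=blue".getBytes()), false));
Collections.sort(edits); // prereq of applyDocEdits: doc id order, then term order
RecBuffer<MemDocRec> merged = applyDocEdits(edits, null, 0L); // null head: start from an empty segment
return balance(merged, 16);
}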
public static <T extends MemRec> RecBuffer<T> balance(RecBuffer<T> buffer, int leafSize) {
long ct = buffer.cumulativeCount();
if (ct < leafSize) return buffer;
long targetLen = (long)Math.sqrt(ct);
// if (targetLen < leafSize/2) targetLen = leafSize/2;
RecBuffer<T> toWrite = new RecBuffer<T>();
RecBuffer<T> accumulator = new RecBuffer<T>();
long accumLen = 0;
T cur = buffer.head;
while (cur!=null) {
T clone = (T)cur.clone();
cur = (T)cur.next();
if (cur != null) {
accumulator.append(clone);
accumLen += clone.cumulativeCount(); // count the element just appended, not the lookahead
}
if (accumLen >= targetLen || cur == null) {
accumLen = 0;
if (accumulator.head == accumulator.tail) {
if (accumulator.head != null) {
toWrite.append(accumulator.head);
accumulator.clear();
}
} else {
long accumCt = accumulator.cumulativeCount();
RecBuffer<T> segment = balance(accumulator, leafSize);
T head = segment.head;
T tail = segment.tail;
tail = (T)tail.clone();
tail.setDown(head);
tail.setDownCt(accumCt-1);
toWrite.append(tail);
}
accumulator = new RecBuffer<T>();
}
if (cur == null) {
toWrite.append(clone);
}
}
return toWrite;
//toWrite.write(pos, ImmutableBagIndex.this);
}
@Override
public void close() {
data = null;
header = null;
}
private static final byte[] ZERO_BYTES = new byte[0];
protected void writeNewFile(String filename, long size) {
try {
File file = new File(homeDir + File.separatorChar + filename);
RandomAccessFile raf = new RandomAccessFile(file, "rw");
raf.setLength(size);
raf.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
protected BytesInterface openFile(String filename) {
try {
File file = new File(homeDir + File.separatorChar + filename);
RandomAccessFile raf = new RandomAccessFile(file, "rw");
FileChannel channel = raf.getChannel();
return ChunkedMemoryMappedFile.mapFile(channel);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
protected void reopen() {
if (! new File(homeDir + File.separatorChar + "header").exists()) {
// create new
version = 0;
maxDocId = 0;
rootPos = 0;
writeNewFile("header", 10*8);
writeNewFile("data", 0);
forceHeader();
}
header = openFile("header");
parseHeader();
data = openFile("data");
}
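// Header layout (the file is sized for 10 long slots in reopen()): slot 0 = version,
// slot 1 = maxDocId, slot 2 = rootPos; the remaining slots are currently unused.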
protected void parseHeader() {
version = header.getLong(0);
maxDocId = header.getLong(1);
rootPos = header.getLong(2);
}
protected void forceHeader() {
header.putLong(0, version);
header.putLong(1, maxDocId);
header.putLong(2, rootPos);
header.force();
}
@Override
public long getCurrentRevNum() {
return maxDocId;
}
@Override
public String getHomedir() {
return homeDir;
}
@Override
public void setHomedir(String homeDir) {
this.homeDir = homeDir;
reopen();
}
@Override
public RangeQueryNode getRange(byte[] prefix, byte[] term1, byte[] term2, boolean isExclusive1, boolean isExclusive2, long revNum) {
return new ImmutableBagRangeQuery();
}
@Override
public QueryNode getTerm(byte[] term, long revNum) {
return getRange(term, null, null, true, true, revNum);
}
@Override
public long commitNewRev(Collection<EditRec> edits) {
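// TODO: stub; edits are not yet applied and revision 0 is always returned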
return 0;
}
}
/*
private static interface Rec {
public long cumulativeCount();
public boolean isSegmentPush();
public StackFrameBuffer pullUp(BytesInterface bi);
Rec pushDown(StackFrameBuffer buffer, BytesInterface bi, long targetPageLength);
StackFrameBuffer expandEdits(BytesInterface bi, StackFrameBuffer buffer, List<TermEdit> edits);
public Bytes getSuffixBytes();
public void setSuffixBytes(Bytes bytes);
public long getPrefixPosition();
public void setPrefixPosition(long position);
// public long getVintValue();
// public void setVintValue(long val);
}
*/
// if (cmp == 0) {
// DocRec subSegment = cur.down();
// if (subSegment != null) {// means it's a segment push
// int startIdx = editIdx;
// for(;editIdx<editLength; editIdx++) {
// Edit fwdEdit = edits.get(editIdx);
// if (cur.compareTo(fwdEdit)>0) break;
// }
// long subCt = cur.cumulativeCount();
// if (ct < subCt) subCt = ct;
// applyDocEdits(edits.subList(startIdx, editIdx), subSegment, subCt);
// edits.addAll(editIdx, )
// editIdx--;
// } else { // leaf node
// if (edit.isDelete) {
// if (prev == null) head.setNext(cur.next());
// else prev.setNext(cur.next());
// } else {
// // weird; attempted to add an existing doc term
// // do nothing?
// }
// }
// } else { // cmp < 0
// MemDocRec newRec = new MemDocRec();
// newRec.setNext(cur);
// prev.setNext(newRec);
// }
// head = head.next();
// }
// }
/*
private static final class EditedRec implements Rec {
List<TermEdit> edits;
long bytesPrefixPosition, vintValue;
Bytes byteSuffix;
StackFrameBuffer pushedSegment;
public EditedRec(List<TermEdit> edits, long vintValue, long bytesPrefixPosition, Bytes byteSuffix, StackFrameBuffer pushedSegment) {
this.edits = edits;
this.vintValue = vintValue;
this.bytesPrefixPosition = bytesPrefixPosition;
this.byteSuffix = byteSuffix;
this.pushedSegment = pushedSegment;
}
public long cumulativeCount() {
long count = pushedSegment.cumulativeCount();
for(TermEdit edit : edits) {
if (edit.isInsertion) count++;
else count--;
}
return count;
}
public long getPrefixPosition() {
return bytesPrefixPosition;
}
public Bytes getSuffixBytes() {
return byteSuffix;
}
public long getVintValue() {
return vintValue;
}
public boolean isSegmentPush() {
return true;
}
public void setPrefixPosition(long position) {
this.bytesPrefixPosition = position;
}
public void setSuffixBytes(Bytes bytes) {
this.byteSuffix = bytes;
}
public void setVintValue(long val) {
this.vintValue = val;
}
}
private static final class GenericRec implements Rec, PubliclyCloneable {
// term page: backwards offset into top doc page
// doc page: doc id increment from previous value
long vintValue;
// term page: pointer to the parent entry
// doc page: pointer to term page's term data
long bytePrefixPosition;
// term page: term prefix covering doc pages and/or segment pushes
// doc page: term suffix for current doc, should be zero on segment pushes
Bytes byteSuffix;
long segPushPosition, segPushCap;
public GenericRec() {}
public GenericRec(
BytesInterface bi,
long vintValue,
long bytePrefixPosition,
Bytes byteSuffix,
long segPushPosition,
long segPushCap) {
this.set(bi, vintValue, bytePrefixPosition, byteSuffix, segPushPosition, segPushCap);
}
public GenericRec(BytesInterface bi,
long vintValue,
long bytePrefixPosition,
long byteSuffixPosition,
long byteSuffixLength,
long segPushPosition,
long segPushCap) {
this.set(bi, vintValue, bytePrefixPosition, byteSuffixPosition, byteSuffixLength, segPushPosition, segPushCap);
}
public void set(
BytesInterface bi,
long vintValue,
long bytePrefixPosition,
long byteSuffixPosition, long byteSuffixLength,
long segPushPosition, long segPushCap) {
this.vintValue = vintValue;
this.bytePrefixPosition = bytePrefixPosition;
this.byteSuffix = new MaterializedBytes(bi, byteSuffixPosition, (int)byteSuffixLength);
this.segPushPosition = segPushPosition;
this.segPushCap = segPushCap;
}
public void set(
BytesInterface bi,
long vintValue,
long bytePrefixPosition,
Bytes byteSuffix,
long segPushPosition, long segPushCap) {
this.vintValue = vintValue;
this.bytePrefixPosition = bytePrefixPosition;
this.byteSuffix = byteSuffix;
this.segPushPosition = segPushPosition;
this.segPushCap = segPushCap;
}
@Override
public boolean isSegmentPush() { return segPushPosition != 0; }
@Override
public long cumulativeCount() {
if (isSegmentPush()) return segPushCap;
else return 1;
}
public GenericRec clone() {
try {
return (GenericRec)super.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
public long getPrefixPosition() {
return this.bytePrefixPosition;
}
public Bytes getSuffixBytes() {
return byteSuffix;
}
public void setSuffixBytes(Bytes bytes) {
this.byteSuffix = bytes;
}
public long getVintValue() {
return this.vintValue;
}
public void setPrefixPosition(long position) {
this.bytePrefixPosition = position;
}
public void setVintValue(long val) {
this.vintValue = val;
}
public String toString() {
byte[] buffer = new byte[byteSuffix.getLength()];
byteSuffix.getBytes(buffer, 0);
return "GRec("+vintValue+", "+bytePrefixPosition+":"+Util.bytesAsString(buffer)
+", "+segPushPosition+":"+segPushCap+")";
}
}
static abstract class PageType {
abstract GenericRec pushDown(StackFrameBuffer buffer, BytesInterface bi, long targetPageLength);
abstract StackFrameBuffer pullUp(BytesInterface bi, GenericRec rec);
abstract StackFrameBuffer expandEdits(BytesInterface bi, StackFrameBuffer buffer, List<TermEdit> edits);
}
static final class TermPageType extends PageType {
@Override
StackFrameBuffer pullUp(BytesInterface bi, GenericRec parent) {
StackFrameBuffer buffer = readGenericSegment(bi, parent.segPushPosition, parent.segPushCap);
Bytes suffix = parent.byteSuffix;
if (suffix != null) {
long newHeaderPosition = parent.bytePrefixPosition;
for(Rec rec : buffer.getContents()) {
rec.setPrefixPosition(newHeaderPosition);
rec.setSuffixBytes(AbstractBytes.prependBytes(rec.getSuffixBytes(), suffix));
}
}
return buffer;
}
@Override
StackFrameBuffer expandEdits(BytesInterface bi, StackFrameBuffer buffer, List<TermEdit> edits) {
long commonPrefixPosition = buffer.contents.get(0).getPrefixPosition();
TermEdit curEdit = edits.get(0);
int numEdits = edits.size();
int editIdx = 1;
StackFrameBuffer newBuffer = new StackFrameBuffer();
for(Rec rec : buffer.getContents()) {
GenericRec genericRec = (GenericRec)rec;
while(curEdit != null) {
Bytes b1 = curEdit.term;
Bytes b2 = rec.getSuffixBytes();
boolean pastInsertion = b1.compareTo(b2) <= 0;
if (b2.isPrefixOf(b1)) { // inject into
// modify both the summary list, if present (transition to doc id pages)
// and the pushed segment, if present
// pushed segment first (depth first traversal)
long segPushPos = genericRec.segPushPosition;
if (segPushPos != -1) {
ArrayList<TermEdit> subEdits = new ArrayList<TermEdit>();
subEdits.add(curEdit);
while(editIdx < numEdits && b2.isPrefixOf(edits.get(editIdx).term)) {
subEdits.add(edits.get(editIdx++));
}
newBuffer.add(new EditedRec(subEdits, rec.getVintValue(), rec.getPrefixPosition(),
b2, readGenericSegment(bi, genericRec.segPushPosition,
genericRec.segPushCap)));
}
} else if (pastInsertion) { // insert before
GenericRec singletonRec = new GenericRec(bi, curEdit.docId, commonPrefixPosition, null, -1, -1);
long singletonPos = bi.getSize();
writeGenericRec(bi, singletonRec);
Rec termHeader = new GenericRec(bi, -1L, commonPrefixPosition, b1, singletonPos, 1L);
newBuffer.add(termHeader);
} else {
break;
}
if (editIdx < numEdits) {
curEdit = edits.get(editIdx++);
} else {
curEdit = null;
}
}
newBuffer.add(rec);
}
return newBuffer;
}
@Override
GenericRec pushDown(StackFrameBuffer buffer, BytesInterface bi, long targetPageLength) {
Rec firstRec = buffer.contents.get(0);
Bytes firstTermBytes = firstRec.getSuffixBytes();
int smallestDivergentIndex = firstTermBytes.getLength();
for(Rec rec : buffer.contents) {
Bytes termBytes = rec.getSuffixBytes();
int len = termBytes.getLength();
if (len < smallestDivergentIndex) {
smallestDivergentIndex = len;
}
for(int i=0; i<smallestDivergentIndex; i++) {
if (termBytes.get(i) != firstTermBytes.get(i)) {
smallestDivergentIndex = i;
break;
}
}
}
long commonPrefixPosition = firstRec.getPrefixPosition();
long newBytesPosition = -1;
long newBytesLength = 0;
if (smallestDivergentIndex > 0) {
// copy a new header
byte[] headerBytes = new byte[smallestDivergentIndex];
for(int i=0; i<smallestDivergentIndex; i++) {
headerBytes[i] = firstTermBytes.get(i);
}
newBytesPosition = bi.getSize();
newBytesLength = smallestDivergentIndex;
Bytes newBytes = new ArrayBytes(headerBytes);
writeByteValueNode(bi, commonPrefixPosition, newBytes);
// move internal pointers down
for(Rec rec : buffer.contents) {
rec.setPrefixPosition(newBytesPosition);
rec.setSuffixBytes(AbstractBytes.removePrefixBytes(rec.getSuffixBytes(), (int)newBytesLength));
}
}
long segPushPosition = balanceAndWriteSegment(buffer, bi, this, targetPageLength);
long segPushCap = buffer.cumulativeCount();
long vintValue = -1; // create summary list
GenericRec rec = new GenericRec(bi, vintValue, commonPrefixPosition, newBytesPosition, newBytesLength, segPushPosition, segPushCap);
return rec;
}
}
static final TermPageType TERM_PAGE_TYPE = new TermPageType();
static final class DocPageType extends PageType {
@Override
StackFrameBuffer pullUp(BytesInterface bi, GenericRec parent) {
StackFrameBuffer buffer = readDocSegment(bi, parent.segPushPosition, parent.segPushCap, parent.vintValue);
Rec firstRec = buffer.getContents().get(0);
((GenericRec)firstRec).vintValue = parent.vintValue;
return buffer;
}
// <edits> is assumed to be doc id sorted
@Override
StackFrameBuffer expandEdits(BytesInterface bi, StackFrameBuffer buffer, List<TermEdit> edits) {
long commonPrefixPosition = buffer.contents.get(0).getPrefixPosition();
TermEdit curEdit = edits.get(0);
int numEdits = edits.size();
int editIdx = 1;
Rec lastRec = null;
StackFrameBuffer newBuffer = new StackFrameBuffer();
for(Rec rec : buffer.getContents()) {
GenericRec genericRec = (GenericRec)rec;
while(curEdit != null) {
long editDocId = curEdit.docId;
long curDocId = rec.getVintValue();
if (editDocId <= curDocId) {
if (lastRec.isSegmentPush()) {
if (editDocId < curDocId) {
ArrayList<TermEdit> subEdits = new ArrayList<TermEdit>();
subEdits.add(curEdit);
while(editIdx < numEdits && edits.get(editIdx).docId < curDocId) {
subEdits.add(edits.get(editIdx++));
}
newBuffer.add(new EditedRec(subEdits, rec.getVintValue(), commonPrefixPosition,
null, readDocSegment(bi, genericRec.segPushPosition,
genericRec.segPushCap, curDocId)));
}
}
if (editDocId < curDocId ||
(editDocId==curDocId &&
curEdit.term.compareTo(rec.getSuffixBytes()) > 0)) {
newBuffer.add(new GenericRec(bi, editDocId, commonPrefixPosition, curEdit.term, -1, 0));
if (editIdx < numEdits) {
curEdit = edits.get(editIdx++);
} else {
curEdit = null;
}
}
}
}
newBuffer.add(rec);
lastRec = rec;
}
return newBuffer;
}
@Override
GenericRec pushDown(StackFrameBuffer buffer, BytesInterface bi, long targetPageLength) {
long segPushPosition = balanceAndWriteSegment(buffer, bi, this, targetPageLength);
long segPushCap = buffer.cumulativeCount();
long parentVintValue = buffer.contents.get(0).getVintValue();
GenericRec rec = new GenericRec();
rec.set(bi, parentVintValue, -1, -1, 0, segPushPosition, segPushCap);
return rec;
}
}
static final DocPageType DOC_PAGE_TYPE = new DocPageType();
static final class StackFrameBuffer {
ArrayList<Rec> contents = new ArrayList<Rec>();
long cumulativeCount = 0L;
byte[] byteSuffix;
long bytePrefixPosition;
public void add(Rec r) {
contents.add(r);
cumulativeCount += r.cumulativeCount();
}
public ArrayList<Rec> getContents() { return contents; }
public boolean isSegmentPush() { return true; }
public long cumulativeCount() { return cumulativeCount; }
public ListIterator<Rec> getListIterator() {
return contents.listIterator();
}
public boolean isEmpty() {
return contents.isEmpty();
}
public int size() {
return contents.size();
}
public void clear() {
contents.clear();
this.cumulativeCount = 0L;
}
}
static final class TermEdit {
Bytes term;
long docId;
boolean isInsertion;
}
static final Comparator<TermEdit> TERM_EDIT_COMPARATOR = new Comparator<TermEdit>() {
public int compare(TermEdit t1, TermEdit t2) {
Bytes bi1 = t1.term;
Bytes bi2 = t2.term;
return bi1.compareTo(bi2);
}
};
static final Comparator<TermEdit> DOC_EDIT_COMPARATOR = new Comparator<TermEdit>() {
public int compare(TermEdit t1, TermEdit t2) {
long cmp = (t1.docId - t2.docId);
if (cmp < 0) return -1;
else if (cmp > 0) return 1;
else return 0;
}
};
// static StackFrameBuffer commit(BytesInterface bi, long pos, long cap, List<TermEdit> edits, PageType pageType) {
// if (pageType == TERM_PAGE_TYPE) {
// StackFrameBuffer buffer = readGenericSegment(bi, pos, cap);
// long commonPrefixPosition = buffer.contents.get(0).getPrefixPosition();
// TermEdit curEdit = edits.get(0);
// int numEdits = edits.size();
// int editIdx = 1;
// StackFrameBuffer newBuffer = new StackFrameBuffer();
// Rec lastRec = null;
// for(Rec rec : buffer.getContents()) {
// GenericRec genericRec = (GenericRec)rec;
// while(curEdit != null) {
// Bytes b1 = curEdit.bi;
// Bytes b2 = rec.getSuffixBytes();
// boolean pastInsertion = b1.compareTo(b2) <= 0;
// if (b2.isPrefixOf(b1)) { // inject into
// // modify both the summary list, if present (transition to doc id pages)
// // and the pushed segment, if present
//
// // pushed segment first (depth first traversal)
// long segPushPos = genericRec.segPushPosition;
// if (segPushPos != -1) {
// ArrayList<TermEdit> subEdits = new ArrayList<TermEdit>();
// subEdits.add(curEdit);
// while(editIdx < numEdits && b2.isPrefixOf(edits.get(editIdx).bi)) {
// subEdits.add(edits.get(editIdx));
// }
// StackFrameBuffer subBuffer = commit(bi, segPushPos, genericRec.segPushCap, TERM_PAGE_TYPE, subEdits);
// pageType.pullUp(bi, rec);
// newBuffer.contents.addAll(subBuffer.contents);
// }
// } else if (pastInsertion) { // insert before
// Rec docIdEntry = new UnmaterializedRec(curEdit.docId, commonPrefixPosition, b1, null);
// Rec termHeader = new UnmaterializedRec(??, commonPrefixPosition, b1, null);
// newBuffer.add();
// } else {
// break;
// }
// if (editItr.hasNext()) {
// curEdit = editItr.next();
// } else {
// curEdit = null;
// }
// }
// newBuffer.add(rec);
// }
// } else {
// StackFrameBuffer buffer = readDocSegment(bi, pos, cap);
// long commonPrefixPosition = buffer.contents.get(0).getPrefixPosition();
//
// }
// }
static long balanceAndWriteSegment(StackFrameBuffer buffer, BytesInterface bi, PageType pageType, long targetPageLength) {
List<Rec> contents = buffer.getContents();
StackFrameBuffer output = new StackFrameBuffer();
long targetSublength = targetPageLength / buffer.cumulativeCount();
if (targetSublength < 1) targetSublength = 1;
StackFrameBuffer collapseQueue = new StackFrameBuffer();
for(int i=0; i < contents.size(); i++) {
GenericRec curRec = (GenericRec)contents.get(i);
long curCount = curRec.cumulativeCount();
long curDelta = Math.abs(curCount - targetSublength);
if (! collapseQueue.isEmpty()) {
long prevCount = collapseQueue.cumulativeCount();
long deltaOfExistingPair = curDelta + Math.abs(prevCount-targetSublength);
long deltaOfCombinedPair = Math.abs(prevCount+curCount - targetSublength);
if (deltaOfExistingPair > deltaOfCombinedPair) { // we should merge it
collapseQueue.add(curRec);
continue;
}
// we decided not to merge, cut off the segment and use it:
if (collapseQueue.size() == 1) {
output.add(collapseQueue.getContents().get(0));
} else {
GenericRec pushRec = pageType.pushDown(collapseQueue, bi, targetPageLength);
output.add(pushRec);
}
collapseQueue.clear();
}
if (curCount > targetSublength) { // consider a split
long avgSizeOfExpandedItems = curCount/targetPageLength;
long deltaOfEachExpandedItem = Math.abs(avgSizeOfExpandedItems - targetSublength);
long deltaOfAllExpandedItems = deltaOfEachExpandedItem * targetPageLength;
if ( curDelta > deltaOfAllExpandedItems ) { // we should split
StackFrameBuffer subBuf = pageType.pullUp(bi, curRec);
contents.remove(i);
contents.addAll(i, subBuf.contents);
continue;
}
} else {
collapseQueue.add(curRec);
}
}
return writeGenericSegment(bi, output);
}
static void readTermRec(BytesInterface bytes, VarPosition pos, GenericRec rec) {
readGenericRec(bytes, pos, rec);
}
static void readDocRec(BytesInterface bytes, VarPosition pos, GenericRec rec) {
long prevDocId = rec.vintValue;
readGenericRec(bytes, pos, rec);
rec.vintValue += prevDocId;
}
static void readGenericRec(BytesInterface bytes, VarPosition pos, GenericRec rec) {
long bytePrefixPosition, vintValue;
long segPushPosition = -1;
long segPushCap = 0;
long byteSuffixPosition = -1;
long byteSuffixLength = 0;
vintValue = bytes.getVLong(pos);
boolean hasByteValue = (vintValue & 0x02L) != 0;
boolean hasSegmentPush = (vintValue & 0x01L) != 0;
vintValue >>= 2;
if (hasByteValue) {
long start = pos.position;
bytePrefixPosition = bytes.getVLong(pos);
boolean hasSuffix = (bytePrefixPosition & 0x01L) != 0;
bytePrefixPosition = start - (bytePrefixPosition >> 1);
if (hasSuffix) {
byteSuffixLength = bytes.getVLong(pos);
byteSuffixPosition = pos.position;
} else {
byteSuffixPosition = -1;
}
} else {
bytePrefixPosition = -1;
}
if (hasSegmentPush) {
long start = pos.position;
segPushPosition = start - bytes.getVLong(pos);
segPushCap = bytes.getVLong(pos);
} else {
segPushPosition = -1;
}
rec.set(bytes, vintValue, bytePrefixPosition, byteSuffixPosition, byteSuffixLength, segPushPosition, segPushCap);
}
static void writeByteValueNode(BytesInterface bytes,
long bytePrefixPosition, Bytes suffix) {
bytePrefixPosition = (bytes.getSize() - bytePrefixPosition) << 1;
if (suffix != null) {
bytes.appendVLong(bytePrefixPosition | 0x01L);
int len = suffix.getLength();
bytes.appendVLong(len);
byte[] b = new byte[len];
suffix.getBytes(b, 0);
bytes.appendBytes(b);
} else {
bytes.appendVLong(bytePrefixPosition);
}
}
static void writeGenericRec(BytesInterface bytes, GenericRec rec) {
long vintValue = (bytes.getSize() - rec.vintValue) << 2;
boolean hasByteValue = (rec.bytePrefixPosition != -1);
boolean hasSegmentPush = (rec.segPushPosition != -1);
if (hasByteValue) vintValue |= 0x02;
if (hasSegmentPush) vintValue |= 0x01;
bytes.appendVLong(vintValue);
if (hasByteValue) {
writeByteValueNode(bytes, rec.bytePrefixPosition, rec.byteSuffix);
}
if (hasSegmentPush) {
bytes.appendVLong(bytes.getSize() - rec.segPushPosition);
bytes.appendVLong(rec.segPushCap);
}
}
static StackFrameBuffer readDocSegment(BytesInterface bytes, long position, long cap, long initialDocId) {
VarPosition pos = new VarPosition(position);
if (cap < 1) throw new RuntimeException("cannot have a cap of zero");
StackFrameBuffer buffer = new StackFrameBuffer();
GenericRec rec = new GenericRec();
readDocRec(bytes, pos, rec);
rec.vintValue = initialDocId;
buffer.add(rec);
cap -= rec.cumulativeCount();
while(cap > 0) {
readDocRec(bytes, pos, rec);
buffer.add(rec);
cap -= rec.cumulativeCount();
}
return buffer;
}
static StackFrameBuffer readGenericSegment(BytesInterface bytes, long position, long cap) {
VarPosition pos = new VarPosition(position);
if (cap < 1) throw new RuntimeException("cannot have a cap of zero");
StackFrameBuffer buffer = new StackFrameBuffer();
while(cap > 0) {
GenericRec rec = new GenericRec();
readGenericRec(bytes, pos, rec);
buffer.add(rec);
cap -= rec.cumulativeCount();
}
return buffer;
}
static long writeGenericSegment(BytesInterface bytes, StackFrameBuffer buffer) {
long start = bytes.getSize();
for(Rec rec : buffer.contents) {
writeGenericRec(bytes, (GenericRec)rec);
}
return start;
}
static final class TraversalStack {
long pos;
long cap;
final TraversalStack parent;
TraversalStack(TraversalStack parent, long pos, long cap) {
this.parent = parent;
this.pos = pos;
this.cap = cap;
}
}
static final class Traversal {
TraversalStack bottom;
VarPosition vp;
long cap;
public Traversal(long pos, long cap) {
bottom = null;
vp = new VarPosition(pos);
this.cap = cap;
}
public void push(long pos, long cap) {
if (this.cap < cap) cap = this.cap;
this.cap -= cap;
bottom = new TraversalStack(bottom, vp.position, this.cap);
vp.position = pos;
this.cap = cap;
}
public void pop() {
vp.position = bottom.pos;
cap = bottom.cap;
bottom = bottom.parent;
}
public boolean read(BytesInterface bi, GenericRec rec) {
readGenericRec(bi, vp, rec);
cap -= rec.cumulativeCount();
if (cap > 0) return true;
while (cap <= 0) {
if (bottom == null) return false;
pop();
}
return true;
}
public boolean readUntilOnePrefixOfOther(BytesInterface bi, GenericRec rec, Bytes target) {
do {
Bytes cur = rec.byteSuffix;
if (target.isPrefixOf(cur) || cur.isPrefixOf(target)) return true;
if (cur.compareTo(target) > 0) return false;
} while(read(bi, rec));
return false;
}
public boolean readUntilBytePrefixOf(BytesInterface bi, GenericRec rec, Bytes target) {
do {
if (rec.byteSuffix.isPrefixOf(target)) return true;
if (rec.byteSuffix.compareTo(target) > 0) return false;
} while(read(bi, rec));
return false;
}
private static final ArrayList<Traversal> NO_TRAVERSALS = new ArrayList<Traversal>();
public ArrayList<Traversal> fetchDocTraversalsInRange(BytesInterface bi, Bytes prefix, Bytes min, Bytes max, boolean minIsExclusive, boolean maxIsExclusive) {
ArrayList<Traversal> results = new ArrayList<Traversal>();
GenericRec rec = new GenericRec();
if (prefix.getLength() > 0) {
if (! readUntilOnePrefixOfOther(bi, rec, prefix)) return NO_TRAVERSALS;
if (rec.byteSuffix.getLength() <= prefix.getLength()) {
prefix = new SlicedBytes(prefix, rec.byteSuffix.getLength());
push(rec.segPushPosition, rec.segPushCap);
return fetchDocTraversalsInRange(bi, prefix, min, max);
} else {
min = new CompoundBytes(prefix, min);
max = new CompoundBytes(prefix, max);
while(true) {
Bytes cmpSlice = new SlicedBytes(rec.byteSuffix, prefix.getLength());
if (cmpSlice.compareTo(max) > 0) break;
if (cmpSlice.compareTo(min) >= 0) {
if (rec.vintValue != -1) { // we have a doc id list
results.add(new Traversal(rec.vintValue, rec.));
}
}
read(bi,rec);
}
}
}
}
}
// static abstract class RecAcceptor {
// public abstract boolean accept(GenericRec rec);
// }
//
// public static final class MultiReadResults {
// long start;
// int numRead, numReadCumulative; // counts are up to, but not including the preRec result
// GenericRec preRec, postRec;
// }
//
// public void readUntil(
// StackFrame frame,
// BytesInterface bi,
// RecAcceptor acceptor,
// GenericRec preRec,
// GenericRec postRec,
// MutableInt numRead,
// MutableInt cumulativeRead) {
//
// }
//
// static final class Traversal {
// GenericRec rec;
// ArrayList<StackFrame> stack;
// }
//
// final class TermRec {
// byte[] term;
// long partChange, segmentPush;
// }
//
// final class DocRec {
// long docId;
// long valueStart;
// int valueSize;
// }
//
// static final class TraversalStack {
// final VarPosition vp = new VarPosition(0);
// final TraversalStack parent;
// long cap;
// TraversalStack(TraversalStack parent, long cap) {
// this.parent = parent;
// this.cap = cap;
// }
// public boolean read(BytesInterface bi, GenericRec rec) {
// readGenericRec(bi, vp, rec);
// cap--;
// return cap >= 0;
// }
// public boolean readUntilBytePrefixOf(BytesInterface bi, GenericRec rec, byte[] target) {
// readGenericRec(bi, vp, rec);
// cap--;
// }
// public long getVLong(BytesInterface bytes) {
// return bytes.getVLong(vp);
// }
// public long getPosition() { return vp.position; }
// public void push(long position, long count) {}
// }
//
// @Override
// public long getCurrentRevNum() {
// return maxDocId;
// }
}
*/