package dovetaildb.bagindex;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import dovetaildb.fileaccessor.MappedFile;
import dovetaildb.fileaccessor.OffsetValueFilePair;
import dovetaildb.fileaccessor.PagedFile;
import dovetaildb.scan.IntegerScanner;
import dovetaildb.scan.Scanner;
import dovetaildb.store.BytesInterface;
import dovetaildb.store.VarPosition;
import dovetaildb.util.Pair;
import dovetaildb.util.Util;
import dovetaildb.util.VIntScanner;
public class OriginalTrieBagIndex extends BagIndex {
/**
* Pages:
* Trie page (link up + summary list + link complete list + 256 links down)
* Single value posting list page (link up + single value + compressed doc id list)
* Multi value posting list page (link prev:4 + link next:4 + compressed doc id list + value list) (as much as fits on one page)
* First offset is the complete initial doc number; subsequent offsets are increments, with the low bit reserved to indicate whether the value differs from the previous one.
* When low bit is not set, a value pointer VLong follows, the low bit of which indicates whether the value is inline.
* When this low bit is not set, the value is considered an offset into the values file and a Vlong for the value length follows.
* ID terms are suffixed with pointers to remainder of data -> (compressed value page link & offset) list
* Deletions: add a *del:<docid>*(<txn>) field
*
*/
// Backing page store holding trie and posting-list pages.
PagedFile pages;
// Byte/int-level accessor over the page store.
BytesInterface pageBi;
OffsetValueFilePair docs; // docId -> term vints
// Raw term value storage; postings record offsets into this file.
MappedFile terms;
// Highest docId assigned so far; also serves as the current revision number
// (see getCurrentRevNum).
long maxDocId;
protected enum PageType {TRIE, SINGLE_VAL, MULTI_VAL};
// High bit of a trie child link marks the target as a leaf (posting) page.
protected final int LEAF_MASK = 0x80000000;
// High bit of a leaf page's first int marks it as single-valued.
// NOTE(review): identical bit value to LEAF_MASK — confirm the overlap is intentional.
protected final int SINGLE_MASK = 0x80000000;
protected int getPrevLeafPage(int page) {
	// Leaf header layout (in ints): [0]=prev link, [1]=next link, then doc data.
	long base = pages.getIntOffestForPage(page);
	return pageBi.getInt(base);
}
protected int getNextLeafPage(int page) {
	// The next-page link sits in the second int slot of the leaf header.
	long base = pages.getIntOffestForPage(page);
	return pageBi.getInt(base + 1);
}
private void setPrevLeafPage(int page, int prevPage) {
	// The prev-page link is the first int slot of the leaf header.
	long slot = pages.getIntOffestForPage(page);
	pageBi.putInt(slot, prevPage);
}
/** Stores nextPage in this page's next-link slot (second int of the leaf header). */
private void setNextLeafPage(int page, int nextPage) {
	// FIX: was getIntOffestForPage(page+1) — the FIRST slot of the FOLLOWING
	// page — which clobbered that page's prev link and never matched
	// getNextLeafPage(), which reads getIntOffestForPage(page)+1.
	pageBi.putInt(pages.getIntOffestForPage(page)+1, nextPage);
}
/**
 * Result of a trie descent: the page reached (leaf bit stripped by the caller),
 * the number of leading term bytes consumed, and the trie page the final link
 * was read from (-1 when no descent step was taken).
 */
final class LookedUpTerm {
public final int page, termPrefixLen, parentPage;
public LookedUpTerm(int page, int termPrefixLen, int parentPage) {
this.page = page;
this.termPrefixLen = termPrefixLen;
this.parentPage = parentPage;
}
}
/**
 * Walks the trie from root page 0, consuming one term byte per level, until a
 * child link with LEAF_MASK set is found or the term is exhausted.
 * Returns the reached page (mask stripped), the consumed prefix length, and
 * the parent trie page.
 * NOTE(review): child links are read at offset+(byte) with no header skip,
 * whereas descendOffset() skips 3 header ints and writePlan() emits 4 —
 * these three layouts disagree; confirm the intended trie-page header size.
 */
protected LookedUpTerm findLeafForTerm(byte[] term) {
int parentPage = -1;
int page = 0; // descent always begins at the root trie page
int termIdx = 0;
while(termIdx < term.length) {
long offset = pages.getIntOffestForPage(page);
parentPage = page;
page = pageBi.getInt(offset+(term[termIdx++]&0xff));
if ((LEAF_MASK & page) == LEAF_MASK) break;
}
return new LookedUpTerm(page & ~LEAF_MASK, termIdx, parentPage);
}
// Page size in bytes (see constructor; currently only used to derive minRecsPerPage).
protected final int pageSize;
// Values no longer than this many bytes are stored inline on the posting page.
protected final int overflowThreshold;
// Below this record count a run is kept on one leaf page instead of splitting.
protected final int minRecsPerPage;
/** Allocates a continuation page, links it from {@code page}, and returns its index. */
private int extendSingleValuedPage(int page) {
	int newPage = pages.newPageIndex();
	// Chain forward: the old page's first int slot points at the new page.
	long linkSlot = pages.getIntOffestForPage(page);
	pageBi.putInt(linkSlot, newPage);
	// Zero out the new page's 4-byte next link plus the first vint byte.
	long base = pages.getByteOffestForPage(newPage);
	int i = 0;
	while (i < 5) {
		pageBi.putByte(base + i, (byte) 0);
		i++;
	}
	return newPage;
}
/**
 * One (term value, document id) posting record. The value is held either
 * inline as {@code term} bytes, or by reference as an (offset, length) pair
 * into page storage ({@code term == null} in that case).
 */
final class TermInDocRec implements Comparable {
	final byte[] term;            // inline value bytes, or null when stored by reference
	final long docId;
	final long termValueOffset;   // storage offset of the value when term == null
	final int termValueLength;    // value length when term == null
	int bump;                     // leading value bytes already consumed by the trie descent
	public TermInDocRec(byte[] term, long docId) {
		this(term, docId, 0);
	}
	public TermInDocRec(byte[] term, long docId, long termValueOffset) {
		this.term = term;
		this.docId = docId;
		this.termValueOffset = termValueOffset;
		this.termValueLength = 0;
		this.bump = 0;
	}
	/** Value stored by reference: (offset, length) into page storage. */
	public TermInDocRec(long docId, long termValueOffset, int termValueLength) {
		this.termValueOffset = termValueOffset;
		this.termValueLength = termValueLength;
		this.docId = docId;
		this.term = null;
	}
	/** Same value as {@code prev}, different document id (delta decoding). */
	public TermInDocRec(long docId, TermInDocRec prev) {
		this.docId = docId;
		this.bump = prev.bump;
		this.term = prev.term;
		this.termValueOffset = prev.termValueOffset;
		this.termValueLength = prev.termValueLength;
	}
	@Override
	public boolean equals(Object otherObj) {
		// FIX: guard the cast — the original threw ClassCastException for
		// foreign types instead of returning false.
		if (!(otherObj instanceof TermInDocRec)) return false;
		TermInDocRec o = (TermInDocRec)otherObj;
		// Fast path: identical value storage counts as equal.
		// NOTE(review): this path ignores docId while the compareTo fallback
		// includes it — confirm the asymmetry is intentional (insertTerm uses
		// equals() to detect "same value" runs).
		if (this.term == o.term &&
			this.termValueOffset == o.termValueOffset &&
			this.termValueLength == o.termValueLength) {
			return true;
		}
		return compareTo(otherObj) == 0;
	}
	@Override
	public int hashCode() {
		// FIX: equals() was overridden without hashCode(). Hash on the value
		// only: both equals() paths require matching value storage, so equal
		// records hash identically.
		return term != null ? Arrays.hashCode(term)
		                    : (int)(termValueOffset * 31 + termValueLength);
	}
	public int compareTo(Object otherObj) {
		TermInDocRec other = (TermInDocRec)otherObj;
		int cmp = Util.compareBytes(this.term, other.term);
		if (cmp == 0) {
			// FIX: was (int)(this.docId - other.docId), which overflows for
			// distant ids; Long.compare is overflow-safe.
			cmp = Long.compare(this.docId, other.docId);
		}
		return cmp;
	}
	/** Length of the value in bytes, regardless of storage form. */
	public int getLength() {
		if (term != null) return term.length;
		else return termValueLength;
	}
	/** First unconsumed byte of the value (sign-extended, as before). */
	public int firstByte() {
		// FIX: the null-check was inverted (dereferenced term exactly when it
		// was null — guaranteed NPE — and threw for the inline case).
		// getByteAt() already handles both storage forms.
		return getByteAt(0);
	}
	public byte[] getTerm() {
		if (term != null) {
			return term;
		} else {
			throw new RuntimeException("not yet implemented");
		}
	}
	/** i-th unconsumed byte of the value (inline or read from page storage). */
	public byte getByteAt(int i) {
		if (term != null) {
			return term[bump+i];
		} else {
			return pageBi.getByte(this.termValueOffset+bump+i);
		}
	}
}
/** In-memory plan for one page before it is written out by writePlan(). */
class PagePlan {
	// Records whose value is fully consumed terminate at this node.
	ArrayList<TermInDocRec> docs;
	// Exactly one of {docs, subPlans} is meaningful; 256 slots when present.
	PagePlan[] subPlans;
	public PagePlan() {
		this.docs = new ArrayList<TermInDocRec>();
		this.subPlans = null;
	}
	public PagePlan(ArrayList<TermInDocRec> terminating) {
		this.docs = terminating;
	}
}
/**
 * Builds a page plan for a sorted run of records: a single leaf when the run
 * is small or entirely one record, otherwise a trie node with per-byte
 * sub-plans. Zero-length (fully consumed) values terminate at this node.
 */
private PagePlan generatePlan(List<TermInDocRec> docs) {
	// assumes input is sorted
	int numDocs = docs.size();
	PagePlan plan = new PagePlan();
	if ((numDocs <= minRecsPerPage) ||
		(docs.get(0).compareTo(docs.get(numDocs-1))==0)) {
		// no more splitting required
		for(TermInDocRec doc : docs) {
			plan.docs.add(doc);
		}
	} else {
		// split by the first unconsumed value byte
		plan.subPlans = new PagePlan[256];
		ArrayList<TermInDocRec>[] buckets = new ArrayList[256];
		for(TermInDocRec doc : docs) {
			int len = doc.getLength();
			if (len == 0) {
				// value exhausted: terminates at this node
				plan.docs.add(doc);
			} else {
				int insertIdx = doc.firstByte() & 0xff;
				ArrayList<TermInDocRec> bucket = buckets[insertIdx];
				if (bucket == null) {
					buckets[insertIdx] = bucket = new ArrayList<TermInDocRec>();
				}
				bucket.add(doc);
			}
		}
		// FIX: the original walked the buckets with an index starting at -1,
		// which (a) stored bucket b in subPlans[b-1] — off by one versus the
		// direct byte indexing descendOffset() uses — and (b) replaced
		// plan.docs with bucket 0, discarding the zero-length records
		// collected above.
		// NOTE(review): recursing without advancing rec.bump means a bucket
		// that still exceeds minRecsPerPage re-splits on the same byte —
		// confirm where the consumed prefix is meant to be advanced.
		for (int b = 0; b < 256; b++) {
			if (buckets[b] != null) {
				plan.subPlans[b] = generatePlan(buckets[b]);
			}
		}
	}
	return plan;
}
/**
 * Materializes a PagePlan: leaf plans become posting pages; trie plans become
 * trie pages whose child links are written recursively. Returns the index of
 * the freshly allocated page.
 */
private int writePlan(PagePlan plan, int parentPage) {
int newPage = pages.newPageIndex();
if (plan.subPlans == null) {
// a leaf page
// NOTE(review): no prev/next link header is written here, yet
// parseMultiValuedLeafPage()/getDocsStartForPage() skip 8 header bytes —
// confirm where the leaf header is expected to be initialized.
TermInDocRec prev = null;
VarPosition position = new VarPosition(pages.getByteOffestForPage(newPage ));
VarPosition cap = new VarPosition(pages.getByteOffestForPage(newPage+1));
for(TermInDocRec rec : plan.docs) {
// NOTE(review): the boolean result is ignored — records that do not
// fit on the page are silently dropped.
writeTermInDocRecAndValue(prev, rec, position, cap);
prev = rec;
}
} else {
// a trie page
long intOffset = pages.getIntOffestForPage(newPage);
// link up + summary list + link complete list + 256 links down
pageBi.putInt(intOffset++, parentPage);
pageBi.putInt(intOffset++, 0);
if ((plan.docs != null) && (plan.docs.size() > 0)) {
// records terminating here go on their own leaf page ("complete list")
pageBi.putInt(intOffset++, writePlan(new PagePlan(plan.docs), newPage));
} else {
pageBi.putInt(intOffset++, 0);
}
// NOTE(review): this fourth header int makes the 256 child links start at
// slot 4, but descendOffset() reads them at slot 3 + byte — confirm which
// layout is canonical and either drop this int or shift the reader.
pageBi.putInt(intOffset++, 0);
for(PagePlan subPlan : plan.subPlans) {
if (subPlan == null) {
pageBi.putInt(intOffset++, 0);
} else {
int subPageId = writePlan(subPlan, newPage);
pageBi.putInt(intOffset++, subPageId);
}
}
}
return newPage;
}
/**
 * Int-slot offset of the child link for byteValue: 3 header ints
 * (parent, summary, complete-list link) precede the 256 child links.
 * NOTE(review): writePlan() emits 4 header ints and findLeafForTerm() skips
 * none — the three layouts disagree; verify before relying on this offset.
 */
private long descendOffset(int page, byte byteValue) {
return pages.getIntOffestForPage(page) + 3 + (byteValue & 0xff);
}
/** Follows the child link for the given byte out of a trie page. */
private int descend(int page, byte byteValue) {
	return pageBi.getInt(descendOffset(page, byteValue));
}
/**
 * Splits the multi-valued leaf reached from parentPage via byteValue into a
 * deeper trie: re-plans its records, writes the replacement subtree, points
 * the parent at it, and retires the old page.
 */
private void split(int parentPage, byte byteValue) {
	int page = descend(parentPage, byteValue);
	ArrayList<TermInDocRec> recs = parseMultiValuedLeafPage(page);
	Collections.sort(recs);
	PagePlan plan = generatePlan(recs);
	int newRootPage = writePlan(plan, parentPage);
	// swap the parent over to the new page
	// FIX: the child link lives in parentPage's trie slot — the original
	// passed the leaf page itself to descendOffset, corrupting the leaf
	// instead of updating the parent's link.
	pageBi.putInt(descendOffset(parentPage, byteValue), newRootPage);
	// mark the original for deletion
	pages.markForDeletion(page);
}
/**
 * Reads every record off a (possibly chained) multi-valued leaf page ring.
 * The chain is circular: iteration stops when the next link points back to
 * the first page.
 */
private ArrayList<TermInDocRec> parseMultiValuedLeafPage(int page) {
	ArrayList<TermInDocRec> recs = new ArrayList<TermInDocRec>();
	long firstPage = page;
	while (true) {
		// skip the 8-byte prev/next link header
		VarPosition vp = new VarPosition(pages.getByteOffestForPage(page)+8L);
		VarPosition top = new VarPosition(pages.getByteOffestForPage(page+1));
		// FIX: track the previous record — the original always passed null,
		// which NPEs inside readTermInDocRec for any "same value" record.
		TermInDocRec prev = null;
		while (true) {
			TermInDocRec rec = this.readTermInDocRec(prev, vp, top);
			if (rec == null) break;
			recs.add(rec);
			prev = rec;
		}
		long nextPage = pageBi.getUInt(pages.getIntOffestForPage(page)+1);
		if (nextPage == firstPage) break;
		// FIX: the original never advanced `page`, re-parsing the same page
		// forever whenever the chain had more than one page.
		page = (int) nextPage;
	}
	return recs;
}
/**
 * Writes a bucket of records (assumed non-empty) onto a fresh leaf page.
 * Single-valued buckets get a SINGLE_MASK-tagged, 0-terminated docId vlong
 * list; mixed buckets get (docId, length) vlongs with value bytes packed
 * downward from the page top.
 * NOTE(review): the newTriePage parameter is unused.
 * NOTE(review): entries here are plain vlongs, but readTermInDocRec expects
 * the shifted/flagged encodings ((docId<<1), (len<<1)|1) — confirm which
 * reader consumes pages built by this method.
 * NOTE(review): in the multi-value branch the cap passed to putVLong is the
 * pre-decrement byteOffsetCap, so a vint may overlap value bytes written
 * just below it — verify the bound.
 */
private int makeLeafPageUsing(List<TermInDocRec> bucket, int newTriePage) {
int page = pages.newPageIndex();
byte[] firstTerm = bucket.get(0).term;
boolean isSingleValued = true;
for(TermInDocRec rec : bucket) {
if (Util.compareBytes(firstTerm, rec.term) != 0) {
isSingleValued = false;
break;
}
}
long byteOffset = pages.getByteOffestForPage(page);
long byteOffsetCap = pages.getByteOffestForPage(page+1);
long intOffset = pages.getIntOffestForPage(page);
if (isSingleValued) {
// tag the page as single-valued; docIds follow as vlongs, 0-terminated
pageBi.putInt(intOffset, SINGLE_MASK);
VarPosition vp = new VarPosition(byteOffset + 4);
for(TermInDocRec rec : bucket) {
pageBi.putVLong(vp, rec.docId, byteOffsetCap);
}
pageBi.putVLong(vp, 0, byteOffsetCap);
} else {
pageBi.putByte(byteOffset++, (byte)0);
VarPosition vp = new VarPosition(byteOffset);
for(TermInDocRec rec : bucket) {
pageBi.putVLong(vp, rec.docId, byteOffsetCap);
pageBi.putVLong(vp, rec.term.length, byteOffsetCap);
// value bytes grow downward from the cap
byteOffsetCap -= rec.term.length;
pageBi.putBytes(byteOffsetCap, rec.term.length, rec.term, 0);
}
pageBi.putVLong(vp, 0, byteOffsetCap);
}
return page;
}
/**
 * Rewinds vp to the start of the most recently written vint: scans backward
 * from position-2 until a byte with the 0x80 bit set is found and resumes
 * just after it.
 * NOTE(review): assumes the encoding sets 0x80 on (only) a vint's final
 * byte — confirm against BytesInterface.putVLong's actual encoding.
 */
protected void backUp(VarPosition vp) {
long i = vp.position - 2;
while ((pageBi.getByte(i) & 0x80) == 0) {
i--;
}
vp.position = i + 1;
}
/**
 * Writes rec and, for values no longer than overflowThreshold, its inline
 * bytes just below the shrinking top-of-page cap. Returns false when the
 * page has no room; longer values are stored by reference only.
 */
protected boolean writeTermInDocRecAndValue(TermInDocRec prev, TermInDocRec rec, VarPosition position, VarPosition top) {
	int len = rec.getLength();
	if (len <= overflowThreshold) {
		// FIX: free space is top - position; the original computed
		// position - top (always negative), so this guard always tripped and
		// no inline record could ever be written.
		if (top.position - position.position <= len) return false;
		if (! writeTermInDocRec(prev, rec, position, top.position - len)) return false;
		// FIX: actually copy the inline value below the cap — the original
		// reserved the space but never wrote the bytes readTermInDocRec
		// reads back from top.
		for (int i = 0; i < len; i++) {
			pageBi.putByte(top.position - len + i, rec.getByteAt(i));
		}
		top.position -= len;
		return true;
	} else {
		// Long values live in the external values file; only the rec is paged.
		return writeTermInDocRec(prev, rec, position, top.position);
	}
}
/**
 * Appends one vint-encoded record at position: docId<<1 with the low bit set
 * meaning "same value as previous", then either (len<<1)|1 for an inline
 * value or (offset<<1) followed by the length for a referenced one.
 * On overflow, rewinds and restores the 0 terminator, returning false.
 */
protected boolean writeTermInDocRec(TermInDocRec prev, TermInDocRec rec, VarPosition position, long cap) {
	// FIX: guard prev — callers legitimately pass null for the first record
	// on a page; the original dereferenced it unconditionally.
	// NOTE(review): compareTo includes docId, so this "same value" branch
	// only fires for fully identical records — confirm whether a value-only
	// comparison was intended.
	if (prev != null && prev.compareTo(rec) == 0) {
		return pageBi.putVLong(position, (rec.docId<<1) | 0x01, cap);
	} else {
		if (! pageBi.putVLong(position, rec.docId<<1, cap)) return false;
		int len = rec.getLength();
		if (len <= overflowThreshold) {
			// inline: (len<<1)|1; the bytes themselves sit at the page top
			if (pageBi.putVLong(position, (len<<1) | 0x01, cap)) return true;
		} else {
			// by reference: (offset<<1) with low bit clear, then the length
			if (pageBi.putVLong(position, (rec.termValueOffset<<1), cap)) {
				if (pageBi.putVLong(position, len, cap)) return true;
				else backUp(position);
			}
		}
		// success returns immediately, this is a fail:
		backUp(position);
		pageBi.putVLong(position, 0, cap);
		return false;
	}
}
/**
 * Decodes one record at position; returns null at the 0 terminator.
 * Layout (per the class doc and writeTermInDocRec): a docId vlong whose low
 * bit means "same value as prev", then a value-pointer vlong whose low bit
 * means "inline" (length in the remaining bits, bytes at the shrinking page
 * top); when clear, the remaining bits are the storage offset and a length
 * vlong follows.
 */
protected TermInDocRec readTermInDocRec(TermInDocRec prev, VarPosition position, VarPosition top) {
	long docId = pageBi.getVLong(position);
	boolean sameVal = (docId & 0x01) == 0x01;
	docId >>= 1;
	if (docId == 0) {
		return null;
	} else if (sameVal) {
		return new TermInDocRec(docId, prev);
	} else {
		// FIX: the original decoded this vlong as the length and the next one
		// as the offset — the reverse of what writeTermInDocRec emits
		// ((offset<<1) then len) and of the format in the class doc, so
		// referenced values came back with offset and length swapped.
		long pointer = pageBi.getVLong(position);
		boolean isInline = (pointer & 0x01) == 0x01;
		if (isInline) {
			int valLen = (int)(pointer >> 1);
			top.position -= valLen;
			byte[] literal = new byte[valLen];
			pageBi.getBytes(top.position, valLen, literal, 0);
			return new TermInDocRec(literal, docId);
		} else {
			long valPos = pointer >> 1;
			int valLen = (int)pageBi.getVLong(position);
			return new TermInDocRec(docId, valPos, valLen);
		}
	}
}
/**
 * Inserts one record: descends the trie to the term's leaf ring, appends to
 * the ring's last page, and on overflow either chains a new page (when the
 * full page held a single value, so splitting would be unproductive) or
 * splits the leaf into a deeper trie and retries.
 */
protected void insertTerm(TermInDocRec rec) {
	LookedUpTerm ret = findLeafForTerm(rec.getTerm());
	int termPrefixLen = ret.termPrefixLen;
	int parentTriePage = ret.parentPage;
	int firstLeafPage = ret.page;
	int lastLeafPage = this.getPrevLeafPage(firstLeafPage); // prev of the first ring page is the last
	VarPosition pos = getDocsStartForPage(lastLeafPage);
	VarPosition top = getDocsCapForPage(lastLeafPage);
	TermInDocRec cur;
	TermInDocRec prev = null;
	boolean allSame = true;
	// Scan to the end of the page. FIX: the original never advanced prev
	// (guaranteeing an NPE on any "same value" record and on the write
	// below) and called cur.equals() after cur had become null.
	while ((cur = readTermInDocRec(prev, pos, top)) != null) {
		if (allSame && prev != null && !cur.equals(prev)) allSame = false;
		prev = cur;
	}
	// pos has consumed the 0 terminator; rewind so the new record overwrites
	// it instead of being appended unreachable behind it.
	// NOTE(review): assumes the terminator vlong encodes as a single byte —
	// confirm against BytesInterface.putVLong.
	pos.position -= 1;
	boolean wroteIt = writeTermInDocRecAndValue(prev, rec, pos, top);
	if (wroteIt) {
		return;
	}
	// page full, either add a new page or split the existing set
	if (allSame) {
		// add page; split may be unproductive (last page all had the same value)
		int newPage = pages.newPageIndex();
		setNextLeafPage(newPage, firstLeafPage);
		setPrevLeafPage(newPage, lastLeafPage);
		pos = getDocsStartForPage(newPage);
		top = getDocsCapForPage(newPage);
		wroteIt = writeTermInDocRecAndValue(null, rec, pos, top);
		if (! wroteIt) throw new RuntimeException("record does not fit on an empty page");
		setNextLeafPage(lastLeafPage, newPage);
		setPrevLeafPage(firstLeafPage, newPage);
		return;
	}
	// split the page on the byte that led the descent to this leaf
	byte lastByte = rec.getByteAt(termPrefixLen-1);
	split(parentTriePage, lastByte);
	// and try again
	insertTerm(rec);
}
/** Cursor positioned just past the 8-byte leaf header (prev + next links). */
private VarPosition getDocsStartForPage(int leafPage) {
	return new VarPosition(pages.getByteOffestForPage(leafPage) + 8);
}
/** Write cap for a leaf page: the first byte of the following page. */
private VarPosition getDocsCapForPage(int leafPage) {
	long end = pages.getByteOffestForPage(leafPage + 1);
	return new VarPosition(end);
}
/** Defaults: 1036-byte pages, 8-byte inline-value threshold. */
OriginalTrieBagIndex() {
	this(1036, 8);
}
/**
 * @param pageSize page size in bytes
 * @param overflowThreshold values up to this many bytes are stored inline
 */
OriginalTrieBagIndex(int pageSize, int overflowThreshold) {
	// FIX: was hard-coded to 0, silently discarding the pageSize argument.
	this.pageSize = pageSize;
	this.overflowThreshold = overflowThreshold;
	// 8 header bytes per page; roughly (overflowThreshold+20) bytes per record.
	this.minRecsPerPage = (pageSize - 8)/(overflowThreshold+20);
}
@Override
public void close() {
// Releases the underlying page file. NOTE(review): the terms and docs
// files are not closed here — confirm whether they need closing too.
pages.close();
}
/**
 * Commits a new revision: applies the deletions, then for each inserted
 * document records its store terms and both records and indexes its index
 * terms, assigning consecutive docIds. Returns the new current revision
 * number (== maxDocId afterwards).
 * NOTE(review): docEntryBuf/termValueBuf are filled but never flushed to
 * the docs/terms files in the visible code — confirm where that happens.
 */
@Override
public long commitNewRev(long[] deletions,
		Collection<Pair<byte[][], byte[][]>> inserts) {
	int numInserts = inserts.size();
	long revNum = maxDocId + numInserts;
	if (numInserts == 0) {
		// commits with only deletions occupy a docId space, so pre-delete it
		inserts.add(new Pair<byte[][],byte[][]>(null,null));
		revNum++;
	}
	Arrays.sort(deletions);
	deleteInRev(deletions, revNum);
	long termsBaseOffset = terms.getLength();
	ByteArrayOutputStream docEntryBuf = new ByteArrayOutputStream();
	ByteArrayOutputStream termValueBuf = new ByteArrayOutputStream();
	VIntScanner.writeVLong(docEntryBuf, (long)termValueBuf.size());
	byte[][][] groups = new byte[2][][];
	for(Pair<byte[][],byte[][]> docPair : inserts) {
		byte[][] indexTerms = docPair.getLeft();
		byte[][] storeTerms = docPair.getRight();
		groups[0] = storeTerms;
		groups[1] = indexTerms;
		for(byte[][] insertTerms : groups) {
			// FIX: the deletion-only placeholder pair added above holds null
			// arrays; the original iterated them and threw NullPointerException.
			if (insertTerms == null) continue;
			for(byte[] term : insertTerms) {
				long sz = term.length;
				VIntScanner.writeVLong(docEntryBuf, sz);
				long pos = termsBaseOffset + (long)termValueBuf.size();
				try { termValueBuf.write(term); }
				catch (IOException e) { throw new RuntimeException(e); }
				if (insertTerms == indexTerms) {
					// only index terms get a posting record, keyed by the current docId
					TermInDocRec rec = new TermInDocRec(term, maxDocId, pos);
					insertTerm(rec);
				}
			}
		}
		maxDocId++;
	}
	return maxDocId;
}
/**
 * Records each deletion as a synthetic indexed term: 0x00 'd' followed by
 * the big-endian docId, inserted under the revision's docId.
 */
protected void deleteInRev(long[] deletions, long revNum) {
	for(long deletion : deletions) {
		// FIX: build a fresh term array and record per deletion — the
		// original reused one TermInDocRec over a mutated shared buffer, so
		// every previously inserted deletion record aliased (and ended up
		// holding) the bytes of the last deletion.
		byte[] delTerm = new byte[2+8];
		delTerm[0] = 0;
		delTerm[1] = 'd';
		Util.beLongToBytes(deletion, delTerm, 2);
		// NOTE(review): the third ctor arg (2+8) is termValueOffset — looks
		// like it was meant to be the length; confirm the intended field.
		TermInDocRec rec = new TermInDocRec(delTerm, revNum, 2+8);
		this.insertTerm(rec);
	}
}
/** Range scan between term1 and term2 at revNum. TODO: stub — returns null. */
@Override
public Scanner fetchRange(byte[] term1, byte[] term2, boolean isExclusive1,
boolean isExclusive2, long revNum) {
// TODO Auto-generated method stub
return null;
}
/** Term/document scan for a single term at revNum. TODO: stub — returns null. */
@Override
public Scanner fetchTd(byte[] term, long revNum) {
// TODO Auto-generated method stub
return null;
}
/** Scans all docIds up to revNum (docIds are assigned sequentially). */
@Override
public Scanner fetchAll(long revNum) {
return new IntegerScanner(revNum);
}
/** Deletions visible at revNum. TODO: stub — returns null. */
@Override
public Scanner fetchDeletions(long revNum) {
// TODO Auto-generated method stub
return null;
}
/** Loads the stored document for docId. TODO: stub — returns null. */
@Override
public BagIndexDoc fetchDoc(long docId) {
// TODO Auto-generated method stub
return null;
}
/** Ignores the previous doc object and simply fetches fresh. */
@Override
public BagIndexDoc refetchDoc(BagIndexDoc doc, long docId) {
return fetchDoc(docId);
}
/** TODO: stub — the home directory is not tracked yet; returns null. */
@Override
public String getHomedir() {
// TODO Auto-generated method stub
return null;
}
/** TODO: stub — setting the home directory is currently ignored. */
@Override
public void setHomedir(String homeDir) {
// TODO Auto-generated method stub
}
/** The current revision number equals the highest assigned docId. */
@Override
public long getCurrentRevNum() {
return maxDocId;
}
}