package org.solbase.lucenehbase;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.StringTokenizer;
import java.util.concurrent.TimeoutException;
import net.rubyeye.xmemcached.exception.MemcachedException;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.EmbeddedSortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.schema.EmbeddedIndexedIntField;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.update.DocumentBuilder;
import org.solbase.SolbaseFieldSelector;
import org.solbase.SolbaseShardUtil;
import org.solbase.SolbaseUtil;
import org.solbase.cache.CachedObjectLoader;
import org.solbase.cache.CachedObjectWrapper;
import org.solbase.cache.LayeredCache;
import org.solbase.cache.LayeredCache.ModificationType;
import org.solbase.indexer.ParsedDoc;
import org.solbase.indexer.SolbaseIndexUtil;
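/**
 * Loads Lucene {@link Document}s for Solbase from the HBase document table and keeps the
 * {@link LayeredCache} in sync when documents are deleted, updated, or added.
 *
 * A minimal usage sketch (the loader is normally driven through Solbase's layered cache;
 * the doc id, shard range, and cache instance below are illustrative only):
 * <pre>{@code
 * DocumentLoader loader = new DocumentLoader(null, schema); // null selector loads all "field" columns
 * CachedObjectWrapper<Document, Long> wrapper = loader.loadObject(docId, startDocId, endDocId, cache);
 * Document doc = (wrapper == null) ? null : wrapper.getValue();
 * }</pre>
 */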
public class DocumentLoader implements CachedObjectLoader<Integer, Document, Long, ParsedDoc> {
private static final Logger logger = Logger.getLogger(DocumentLoader.class);
private List<byte[]> fieldNames = new ArrayList<byte[]>();
private IndexSchema schema = null;
private HTableInterface docTable = null;
public DocumentLoader(FieldSelector selector) {
if (selector != null && selector instanceof SolbaseFieldSelector) {
this.fieldNames = ((SolbaseFieldSelector) selector).getFieldNames();
}
}
public DocumentLoader(FieldSelector selector, IndexSchema schema) {
if (selector != null && selector instanceof SolbaseFieldSelector) {
this.fieldNames = ((SolbaseFieldSelector) selector).getFieldNames();
}
this.schema = schema;
}
public DocumentLoader(FieldSelector selector, IndexSchema schema, HTableInterface htable) {
this(selector, schema);
this.docTable = htable;
}
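/**
 * Fetches the document row for the given doc id from HBase and rebuilds a Lucene Document
 * from the "field" column family, honoring the optional field selector. Returns null if
 * the row does not exist.
 */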
public CachedObjectWrapper<Document, Long> loadObject(Integer docNum, int start, int end, LayeredCache<Integer,Document,Long,ParsedDoc> cache) throws IOException {
Document document = new Document();
Get documentGet = new Get(SolbaseUtil.randomize(docNum));
if (fieldNames == null || fieldNames.size() == 0) {
// fetch all columns in the "field" family (this skips the meta info columns)
documentGet.addFamily(Bytes.toBytes("field"));
} else {
for (byte[] fieldName : fieldNames) {
documentGet.addColumn(Bytes.toBytes("field"), fieldName);
}
}
Result documentResult = null;
// if a docTable was injected, reuse it; otherwise open a new table and release it when done
if(this.docTable == null){
HTableInterface docTable = null;
try {
docTable = SolbaseUtil.getDocTable();
documentResult = docTable.get(documentGet);
} finally {
SolbaseUtil.releaseTable(docTable);
}
} else {
documentResult = this.docTable.get(documentGet);
}
if (documentResult == null || documentResult.isEmpty()) {
return null;
}
Long versionIdentifier = 0L; // TODO: get this from the result
NavigableMap<byte[], byte[]> familyMap = documentResult.getFamilyMap(Bytes.toBytes("field"));
for (Map.Entry<byte[], byte[]> fieldColumn : familyMap.entrySet()) {
Field field = null;
String fieldName = Bytes.toString(fieldColumn.getKey());
byte[] value;
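// each stored value ends with a one-byte type marker:
// Byte.MAX_VALUE marks a binary value, Byte.MIN_VALUE marks a UTF-8 string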
ByteBuffer v = ByteBuffer.wrap(fieldColumn.getValue());
int vlimit = v.limit() + v.arrayOffset();
if (v.array()[vlimit - 1] != Byte.MAX_VALUE && v.array()[vlimit - 1] != Byte.MIN_VALUE) {
throw new CorruptIndexException("Solbase field is not properly encoded: " + docNum + "(" + fieldName + ")");
} else if (v.array()[vlimit - 1] == Byte.MAX_VALUE) { // Binary
value = new byte[vlimit - 1];
System.arraycopy(v.array(), v.position() + v.arrayOffset(), value, 0, vlimit - 1);
field = new Field(fieldName, value, Store.YES);
document.add(field);
} else if (v.array()[vlimit - 1] == Byte.MIN_VALUE) { // String
value = new byte[vlimit - 1];
System.arraycopy(v.array(), v.position() + v.arrayOffset(), value, 0, vlimit - 1);
// check for a multi-valued field (values joined by the Solbase delimiter)
String fieldString = new String(value, "UTF-8");
if (fieldString.indexOf(Bytes.toString(SolbaseUtil.delimiter)) >= 0) {
StringTokenizer tok = new StringTokenizer(fieldString, Bytes.toString(SolbaseUtil.delimiter));
while (tok.hasMoreTokens()) {
// schema-aware path: build the field according to its schema type
if(schema != null){
SchemaField sfield = schema.getFieldOrNull(fieldName);
if (sfield.getType() instanceof EmbeddedIndexedIntField) {
EmbeddedIndexedIntField eiif = (EmbeddedIndexedIntField)sfield.getType();
EmbeddedSortField sf = new EmbeddedSortField(fieldName,
tok.nextToken(),
Field.Store.YES,
Field.Index.NO,
eiif.getFieldNumber());
document.add(sf);
} else {
Field f = sfield.createField(tok.nextToken(), 1.0f);
if (f != null) { // null fields are not added
document.add(f);
}
}
} else {
field = new Field(fieldName, tok.nextToken(), Store.YES, Index.ANALYZED);
document.add(field);
}
}
} else {
// schema-aware path: build the field according to its schema type
if(schema != null){
SchemaField sfield = schema.getFieldOrNull(fieldName);
if (sfield.getType() instanceof EmbeddedIndexedIntField) {
EmbeddedIndexedIntField eiif = (EmbeddedIndexedIntField)sfield.getType();
EmbeddedSortField sf = new EmbeddedSortField(fieldName,
fieldString,
Field.Store.YES,
Field.Index.NO,
eiif.getFieldNumber());
document.add(sf);
} else {
Field f = sfield.createField(fieldString, 1.0f);
if (f != null) { // null fields are not added
document.add(f);
}
}
} else {
field = new Field(fieldName, fieldString, Store.YES, Index.ANALYZED);
document.add(field);
}
}
}
}
return new CachedObjectWrapper<Document, Long>(document, versionIdentifier, System.currentTimeMillis());
}
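/**
 * Returns the version identifier for the given document key. Currently a stub: the HBase
 * lookup is commented out and null is always returned.
 */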
public Long getVersionIdentifier(Integer key, int startDocId, int endDocId) throws IOException {
/*
Get documentGet = new Get(Bytes.toBytes(key));
//TODO add appropriate column documentGet.addColumn(Bytes.toBytes("field"), fieldName);
HTableInterface docTable = null;
Result documentResult = null;
try {
docTable = SolbaseUtil.getDocTable();
documentResult = docTable.get(documentGet);
} finally {
SolbaseUtil.releaseTable(docTable);
}
if (documentResult == null || documentResult.isEmpty()) {
return null;
}
*/
Long versionIdentifier = null;// TODO, get from result
return versionIdentifier;
}
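/**
 * Applies a modification to the cached copy of a document. Deletes parse the cached doc so
 * its term vectors can be removed; updates merge the old and new docs via processDocument()
 * and replace the cached value with the merged result; adds should never reach here, since
 * a brand new doc cannot already be cached.
 */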
@Override
public void updateObject(CachedObjectWrapper<Document, Long> object, ParsedDoc modificationData, LayeredCache<Integer, Document, Long, ParsedDoc> cache, LayeredCache.ModificationType modType, int startDocId, int endDocId) throws IOException {
if (modType == ModificationType.DELETE) {
// don't evict the cached doc here; another reader may still be using it, so just let it fall out of the LRU cache
Document oldDoc = object.getValue();
if(oldDoc != null){
// another cluster may have already deleted this doc, in which case the cache no longer has it
modificationData.copyFrom(deleteDocument(oldDoc, Integer.parseInt(oldDoc.getField("docId").stringValue()), modificationData.getIndexName(), modificationData.getIndexWriter(),
modificationData.getIndexUtil(), modificationData.getUpdateStore(), startDocId, endDocId));
}
} else if (modType == LayeredCache.ModificationType.UPDATE){
Document newDoc = modificationData.getDocument();
Document oldDoc = object.getValue();
logger.debug("process document() call in updateObject() for docId: " + Integer.parseInt(oldDoc.getField("docId").stringValue()));
modificationData.copyFrom(processDocument(newDoc, oldDoc, modificationData.getIndexName(), Integer.parseInt(oldDoc.getField("docId").stringValue()), modificationData.getIndexUtil(),
modificationData.getIndexWriter(), modificationData.getUpdateStore()));
object.setValue(modificationData.getDocument());
} else if(modType == LayeredCache.ModificationType.ADD) {
// TODO: this should never be reached; a newly added doc cannot already be in the cache
Document oldDoc = object.getValue();
logger.warn("it should never hit here, newly added doc should never be in cache: " + oldDoc.toString());
}
}
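/**
 * Persists a modification to HBase. Deletes remove the doc's term vectors and tombstone the
 * document row and its DocKeyIdMap row; updates rewrite the document row; adds write a brand
 * new document row.
 */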
@Override
public void updateObjectStore(Integer key, ParsedDoc modificationData, IndexWriter writer, LayeredCache<Integer, Document, Long, ParsedDoc> cache, LayeredCache.ModificationType modType, int startDocId, int endDocId) throws IOException {
Put documentPut = modificationData.getDocumentPut();
Document doc = modificationData.getDocument();
if(modType == LayeredCache.ModificationType.DELETE){
// another cluster may have already deleted this doc, in which case the cache no longer has it
CachedObjectWrapper<Document, Long> cachedObj = loadObject(key, 0, 0, cache);
if(cachedObj != null){
Document oldDoc = cachedObj.getValue();
int docId = Integer.parseInt(oldDoc.getField("docId").stringValue());
modificationData.copyFrom(deleteDocument(oldDoc, docId, modificationData.getIndexName(), modificationData.getIndexWriter(),
modificationData.getIndexUtil(), modificationData.getUpdateStore(), startDocId, endDocId));
// after removing the doc from the term vectors, clean up the Docs and DocKeyIdMap tables
// the rows are tombstoned (rather than deleted) to avoid a race condition in a multi-clustered environment
String globalUniqId = oldDoc.get("global_uniq_id");
Put mappingPut = new Put(Bytes.toBytes(globalUniqId));
mappingPut.add(SolbaseUtil.docIdColumnFamilyName, SolbaseUtil.tombstonedColumnFamilyQualifierBytes, Bytes.toBytes(1));
writer.deleteDocument(documentPut);
writer.updateDocKeyIdMap(mappingPut);
}
} else if(modType == LayeredCache.ModificationType.UPDATE){
// TODO: might want to refactor this logic
// we currently always load the doc before calling updateObject(), which in turn calls
// processDocument() and sets the documentPut object used here
writer.updateDocument(documentPut, modificationData.getDocument());
} else if(modType == LayeredCache.ModificationType.ADD){
writer.addDocument(documentPut, doc);
}
}
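/**
 * Clones and parses the old document so its terms can be recovered, then removes the
 * corresponding term-vector entries through ReaderCache. Returns the parsed doc, or null
 * if the removal fails.
 */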
private ParsedDoc deleteDocument(Document oldDoc, int docId, String indexName, IndexWriter writer, SolbaseIndexUtil indexUtil, boolean updateStore, int startDocId, int endDocId){
try {
// clone the old doc so concurrent reads don't see this mutation
oldDoc = new Document(oldDoc);
oldDoc.removeField("docId");
ParsedDoc parsedDoc = writer.parseDoc(oldDoc, schema.getAnalyzer(), indexName, docId, indexUtil.getSortFieldNames());
List<TermDocMetadata> metadatas = parsedDoc.getTermDocMetadatas();
// TODO: doing duplicate work here - once from updateObject and again from updateObjectStore
for (TermDocMetadata metadata : metadatas) {
ReaderCache.updateTermDocsMetadata(metadata.getTerm(), metadata, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
}
return parsedDoc;
} catch (IOException e) {
logger.error("failed to remove doc " + docId + " from term vectors", e);
} catch (InterruptedException e) {
logger.error("failed to remove doc " + docId + " from term vectors", e);
} catch (MemcachedException e) {
logger.error("failed to remove doc " + docId + " from term vectors", e);
} catch (TimeoutException e) {
logger.error("failed to remove doc " + docId + " from term vectors", e);
}
return null;
}
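/**
 * Merges an incoming (partial) update into the previously stored document. If the update
 * touches any indexed field (or an embedded sort field), the old and new term sets are
 * diffed and the term vectors are updated, added, or deleted accordingly; otherwise only
 * the stored fields are merged and re-parsed.
 */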
private ParsedDoc processDocument(Document newDoc, Document oldDoc, String indexName, int docNumber, SolbaseIndexUtil indexUtil, IndexWriter writer, boolean updateStore){
try {
@SuppressWarnings("unchecked")
List<Fieldable> newFields = newDoc.getFields();
boolean termVectorChanged = false;
for (Fieldable field : newFields) {
if (field.isIndexed() || field instanceof EmbeddedSortField) {
termVectorChanged = true;
break;
}
}
// the update touches at least one indexed field (or embedded sort field), so we need to diff the old and new terms
if (termVectorChanged) {
Field docIdField = oldDoc.getField("docId");
// cloning old doc, so it won't conflict with read
oldDoc = new Document(oldDoc);
oldDoc.removeField("docId");
// parsing old doc to get all terms
try {
ParsedDoc oldParsedDoc = writer.parseDoc(oldDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
List<Term> oldTerms = oldParsedDoc.getAllTerms();
List<TermDocMetadata> oldTermDocMetas = oldParsedDoc.getTermDocMetadatas();
Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
List<TermDocMetadata> newTermDocMetas = parsedDoc.getTermDocMetadatas();
List<Term> newTerms = parsedDoc.getAllTerms();
// diff the term sets: terms in both docs get updated, old-only terms get deleted, new-only terms get added
List<Term> updateList = new ArrayList<Term>(oldTerms);
List<Term> deleteList = new ArrayList<Term>(oldTerms);
List<Term> addList = new ArrayList<Term>(newTerms);
updateList.retainAll(newTerms);
deleteList.removeAll(newTerms);
addList.removeAll(oldTerms);
int shardNum = SolbaseShardUtil.getShardNum(indexName);
int startDocId = SolbaseShardUtil.getStartDocId(shardNum);
int endDocId = SolbaseShardUtil.getEndDocId(shardNum);
// update the term vectors for updated and newly added terms first
for (TermDocMetadata termDocMeta : newTermDocMetas) {
Term term = termDocMeta.getTerm();
if (updateList.contains(term)) {
logger.debug("updating this term: " + term.toString());
ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.UPDATE, updateStore, startDocId, endDocId);
} else if (addList.contains(term)) {
ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.ADD, updateStore, startDocId, endDocId);
}
}
// then remove term-vector entries for terms that no longer appear in the doc
if (deleteList.size() > 0) {
for (TermDocMetadata termDocMeta : oldTermDocMetas) {
Term term = termDocMeta.getTerm();
if (deleteList.contains(term)) {
ReaderCache.updateTermDocsMetadata(term, termDocMeta, indexName, writer, LayeredCache.ModificationType.DELETE, updateStore, startDocId, endDocId);
}
}
}
parsedDoc.getDocument().add(docIdField);
return parsedDoc;
} catch (NullPointerException e) {
// parsing may NPE (e.g. a field not declared in the schema); treat the update as unprocessable
return null;
}
} else {
Document mergedDoc = mergeOldAndNew(oldDoc, newDoc);
ParsedDoc parsedDoc = writer.parseDoc(mergedDoc, schema.getAnalyzer(), indexName, docNumber, indexUtil.getSortFieldNames());
return parsedDoc;
}
} catch (IOException e) {
logger.error("failed to process document update for doc " + docNumber, e);
} catch (InterruptedException e) {
logger.error("failed to process document update for doc " + docNumber, e);
} catch (MemcachedException e) {
logger.error("failed to process document update for doc " + docNumber, e);
} catch (TimeoutException e) {
logger.error("failed to process document update for doc " + docNumber, e);
}
return null;
}
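/**
 * Builds the merged document: every field from the new (partial) doc wins, remaining fields
 * from the old doc are carried over, and the combined fields are rebuilt into a Lucene
 * Document through the schema via DocumentBuilder.
 */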
private Document mergeOldAndNew(Document oldDoc, Document newDoc){
SolrInputDocument newInputDoc = new SolrInputDocument();
@SuppressWarnings("unchecked")
List<Fieldable> newFields = newDoc.getFields();
for (Fieldable field : newFields) {
String fieldName = field.name();
String fieldValue = field.stringValue();
newInputDoc.addField(fieldName, fieldValue);
oldDoc.removeField(fieldName);
}
@SuppressWarnings("unchecked")
List<Fieldable> oldFields = oldDoc.getFields();
for (Fieldable field : oldFields) {
String fieldName = field.name();
String fieldValue = field.stringValue();
newInputDoc.addField(fieldName, fieldValue);
}
Document mergedDoc = DocumentBuilder.toDocument(newInputDoc, schema);
return mergedDoc;
}
}