package org.solbase.indexer.mapreduce;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.lucene.document.Document;
import org.solbase.SolbaseUtil;
import org.solbase.indexer.ParsedDoc;
import org.solbase.indexer.SolbaseIndexUtil;
import org.solbase.indexer.writable.DocumentPutWritable;
import org.solbase.indexer.writable.TermDocMetadataWritable;
import org.solbase.lucenehbase.TermDocMetadata;

public class SolbaseInitialIndexMapper extends TableMapper<BytesWritable, MapWritable> {

    private HTable docTable = null;
    private HTable docKeyIdMapTable = null;

    // current doc id within the reserved chunk; null until the first chunk is reserved
    private Integer docId = null;
    // how many ids of the current chunk have been handed out
    private int idCounter = 0;

    private SolbaseIndexUtil indexerUtil = null;

    // counting how many rows we are indexing
    public static enum Counters {
        TOTAL_ROWS, INDEXED_ROWS, INSUFFICIENT_META_ROWS, INVALIDATED_ROWS, PRIVATE_ROWS, PARTNER_ROWS, VIDEO_ROWS, FLASH_ROWS, NEWEST_ROWS
    }
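
    // Note: only TOTAL_ROWS is incremented directly in this mapper. The remaining
    // counters (INSUFFICIENT_META_ROWS, PRIVATE_ROWS, VIDEO_ROWS, etc.) are presumably
    // incremented by the SolbaseIndexUtil implementation through the Context handed to
    // createLuceneDocument(); that is an assumption based on the counter names, not
    // something visible in this class.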

    @Override
    protected void cleanup(Context context) throws IOException {
        // flush any docs remaining in the client-side write buffers, then release the tables
        docTable.flushCommits();
        docKeyIdMapTable.flushCommits();

        SolbaseUtil.releaseTable(docTable);
        SolbaseUtil.releaseTable(docKeyIdMapTable);
    }

    @Override
    protected void setup(Context context) throws IOException {
        this.docTable = (HTable) SolbaseUtil.getLocalDocTable();
        this.docKeyIdMapTable = (HTable) SolbaseUtil.getLocalDocKeyIdMapTable();

        // the SolbaseIndexUtil implementation is supplied by the job driver via the
        // "indexerUtil" configuration property
        String indexerUtilClassName = context.getConfiguration().get("indexerUtil");

        try {
            indexerUtil = (SolbaseIndexUtil) Class.forName(indexerUtilClassName).newInstance();
        } catch (ClassNotFoundException e) {
            throw new IOException("could not load indexerUtil class: " + indexerUtilClassName, e);
        } catch (InstantiationException e) {
            throw new IOException("could not instantiate indexerUtil class: " + indexerUtilClassName, e);
        } catch (IllegalAccessException e) {
            throw new IOException("could not instantiate indexerUtil class: " + indexerUtilClassName, e);
        }
    }

    @Override
    protected void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
        context.getCounter(Counters.TOTAL_ROWS).increment(1);
        context.setStatus("total rows: " + context.getCounter(Counters.TOTAL_ROWS).getValue());

        // global id is the user_media row key
        String globalId = Bytes.toString(row.get());
        Document doc = indexerUtil.createLuceneDocument(globalId, values, context);
        if (doc == null) {
            // validation in createLuceneDocument must have failed, so skip this row
            return;
        }

        // the row's checksum later becomes the map output key; every row is assumed to
        // carry a meta:checksum column
        byte[] checksum = values.getValue(Bytes.toBytes("meta"), Bytes.toBytes("checksum"));

        // for the initial index we don't bother looking up DocKeyIdMap to see if the doc
        // already exists. Ids are reserved in chunks: when the current chunk is exhausted
        // (or on the very first row), call generateUniqId() and hand out the next
        // UNIQ_ID_CHUNK ids by counting down from the returned value.
        if (this.idCounter > (SolbaseUtil.UNIQ_ID_CHUNK - 1) || docId == null) {
            docId = SolbaseUtil.generateUniqId();
            this.idCounter = 0;
        } else {
            docId--;
        }

        // docId is a globally unique int, so we are tied to the ~2 billion doc limitation.
        // Storing it as a field in the doc is cheap, and it only really matters once
        // sharding comes in and docs need to be fetched by their docId.
        indexerUtil.addFieldToDoc(doc, "docId", docId + "");

        // advance within the current id chunk (lucene doc id sequence)
        this.idCounter++;
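
        // Parse the document into its stored fields and per-term posting metadata, then
        // bundle everything into a single MapWritable keyed by the row checksum: the
        // "doc" entry carries the document put, and one entry per field/term key carries
        // that term's TermDocMetadata for this docId.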
        try {
            ParsedDoc parsedDoc = indexerUtil.getIndexWriter().parseDoc(doc, indexerUtil.getAnalyzer(), "", docId, indexerUtil.getSortFieldNames());

            List<TermDocMetadata> metadatas = parsedDoc.getTermDocMetadatas();

            MapWritable mapWritable = new MapWritable();
            DocumentPutWritable docWritable = new DocumentPutWritable(parsedDoc.getFieldsMap(), parsedDoc.getAllTerms(), docId, globalId);
            mapWritable.put(new BytesWritable(Bytes.toBytes("doc")), docWritable);

            for (TermDocMetadata metadata : metadatas) {
                byte[] key = metadata.getFieldTermKey();
                ByteBuffer buf = metadata.serialize();
                TermDocMetadataWritable writable = new TermDocMetadataWritable(docId, Bytes.toBytes(buf), key);
                mapWritable.put(new BytesWritable(key), writable);
            }

            context.write(new BytesWritable(checksum), mapWritable);
        } catch (InterruptedException e) {
            // context.write() can be interrupted; restore the interrupt flag and fail the task
            // instead of silently dropping the record
            Thread.currentThread().interrupt();
            throw new IOException(e);
        }
    }
}
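
/*
 * Usage sketch: how this mapper might be wired into a job driver. Only
 * TableMapReduceUtil.initTableMapperJob() and the "indexerUtil" property read in
 * setup() are known APIs/behavior of this class; the source table name, the
 * SolbaseIndexUtil implementation, and the reducer class below are hypothetical
 * placeholders.
 *
 *   Configuration conf = HBaseConfiguration.create();
 *   conf.set("indexerUtil", MySolbaseIndexUtil.class.getName()); // hypothetical SolbaseIndexUtil impl
 *
 *   Job job = new Job(conf, "solbase-initial-index");
 *   job.setJarByClass(SolbaseInitialIndexMapper.class);
 *
 *   Scan scan = new Scan(); // full scan of the source table ("user_media" per the comment in map())
 *   TableMapReduceUtil.initTableMapperJob("user_media", scan,
 *       SolbaseInitialIndexMapper.class, BytesWritable.class, MapWritable.class, job);
 *
 *   job.setReducerClass(SolbaseIndexReducer.class); // hypothetical reducer that writes the index
 *   job.waitForCompletion(true);
 */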