/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.solbase.lucenehbase;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.solbase.SolbaseUtil;
/**
 * Term vector of a single document, loaded eagerly from HBase in the constructor.
 *
 * <p>The constructor reads the document's serialized "allTerms" list from the document
 * table, fetches one row per term using the key {@code termKey + delimiter + docId}, and
 * unpacks each row into four parallel arrays (term text, frequency, positions, offsets)
 * that share the same term index.
 *
 * <p>If the document row is missing, the vector is empty: {@link #size()} is 0 and every
 * lookup reports "not found".
 *
 * <p>The array-returning accessors hand out the internal arrays without copying.
 */
public class TermFreqVector implements org.apache.lucene.index.TermFreqVector, org.apache.lucene.index.TermPositionVector {
    private final String field;
    private final byte[] docId;
    // Parallel arrays indexed by term number. They start out empty so that the accessors
    // remain usable when the document row does not exist.
    private String[] terms = new String[0];
    private int[] freqVec = new int[0];
    private int[][] termPositions = new int[0][];
    private TermVectorOffsetInfo[][] termOffsets = new TermVectorOffsetInfo[0][];

    /**
     * Loads the term vector for the given document.
     *
     * @param field    field name reported by {@link #getField()}; it is stored as given and
     *                 is not used to filter the loaded terms
     * @param docIdInt numeric document id; encoded with {@code SolbaseUtil.writeVInt} to form the row key
     * @throws IOException            if an HBase read fails
     * @throws ClassNotFoundException declared for the deserialization of the stored term list
     */
    public TermFreqVector(String field, int docIdInt) throws IOException, ClassNotFoundException {
        this.field = field;
        this.docId = SolbaseUtil.writeVInt(docIdInt);
        // Acquire/release are nested so that a failure while obtaining or releasing one
        // table can never leave the other one unreleased.
        HTableInterface docTable = SolbaseUtil.getDocTable();
        try {
            HTableInterface termVectorTable = SolbaseUtil.getTermVectorTable();
            try {
                load(docTable, termVectorTable);
            } finally {
                SolbaseUtil.releaseTable(termVectorTable);
            }
        } finally {
            SolbaseUtil.releaseTable(docTable);
        }
    }

    /** Reads the document's term list and fills the parallel arrays; leaves them empty if the document row is missing. */
    private void load(HTableInterface docTable, HTableInterface termVectorTable) throws IOException, ClassNotFoundException {
        byte[] allTermsName = Bytes.toBytes("allTerms");
        Get documentGet = new Get(docId);
        documentGet.addColumn(allTermsName, allTermsName);
        Result documentResult = docTable.get(documentGet);
        if (documentResult.isEmpty()) {
            return; // this docId is missing; keep the empty arrays
        }

        @SuppressWarnings("unchecked")
        List<Term> allTerms = (List<Term>) SolbaseUtil.fromBytes(ByteBuffer.wrap(documentResult.getValue(allTermsName, allTermsName)));

        List<Result> termResults = new ArrayList<Result>(allTerms.size());
        for (Term t : allTerms) {
            byte[] termVecKey = Bytes.add(SolbaseUtil.generateTermKey(t), SolbaseUtil.delimiter, docId);
            // The key is a term-vector key, so the row is read from the term vector table.
            // (Previously this queried the document table and left termVectorTable unused.)
            // NOTE(review): confirm the term vector table exposes the field/term/document families read below.
            termResults.add(termVectorTable.get(new Get(termVecKey)));
        }

        int count = termResults.size();
        terms = new String[count];
        freqVec = new int[count];
        termPositions = new int[count][];
        termOffsets = new TermVectorOffsetInfo[count][];

        byte[] fieldName = Bytes.toBytes("field");
        byte[] termName = Bytes.toBytes("term");
        byte[] documentFamily = Bytes.toBytes("document");
        int i = 0;
        for (Result row : termResults) {
            Term t = new Term(Bytes.toString(row.getValue(fieldName, fieldName)), Bytes.toString(row.getValue(termName, termName)));
            terms[i] = t.text();

            // NOTE(review): documentTermInfo is null when the row has no cell for this docId;
            // how TermDocMetadata treats null is not visible here - verify.
            byte[] documentTermInfo = row.getValue(documentFamily, docId);
            TermDocMetadata termInfo = new TermDocMetadata(0, documentTermInfo);

            // The frequency is derived from the number of recorded positions.
            termPositions[i] = termInfo.getPositions();
            freqVec[i] = termPositions[i].length;

            if (!termInfo.hasOffsets()) {
                termOffsets[i] = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            } else {
                // Offsets are stored flat as consecutive (start, end) pairs.
                int[] offsets = termInfo.getOffsets();
                termOffsets[i] = new TermVectorOffsetInfo[freqVec[i]];
                for (int j = 0, k = 0; j < offsets.length; j += 2, k++) {
                    termOffsets[i][k] = new TermVectorOffsetInfo(offsets[j], offsets[j + 1]);
                }
            }
            i++;
        }
    }

    /** @return the field name passed to the constructor */
    public String getField() {
        return field;
    }

    /** @return the per-term frequencies, parallel to {@link #getTerms()} (internal array, not a copy) */
    public int[] getTermFrequencies() {
        return freqVec;
    }

    /** @return the term texts in the order they were loaded (internal array, not a copy) */
    public String[] getTerms() {
        return terms;
    }

    /**
     * Finds the index of a term in this vector.
     *
     * <p>Uses a linear scan because nothing in the loading code guarantees that the terms
     * are sorted, which a binary search would require.
     *
     * @param term term text to look up; {@code null} is never found
     * @return the index of the first matching term, or -1 if the term is not present
     */
    public int indexOf(String term) {
        if (term == null) {
            return -1;
        }
        for (int i = 0; i < terms.length; i++) {
            if (term.equals(terms[i])) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Looks up several terms at once.
     *
     * @param terms array containing the terms to search for
     * @param start index in {@code terms} of the first term to look up
     * @param len   number of terms to look up
     * @return an array of length {@code len}; slot {@code i} holds
     *         {@code indexOf(terms[start + i])}, i.e. -1 for terms that are not present
     */
    public int[] indexesOf(String[] terms, int start, int len) {
        int[] res = new int[len];
        for (int i = 0; i < len; i++) {
            res[i] = indexOf(terms[start + i]);
        }
        return res;
    }

    /** @return the number of terms in this vector (0 when the document was not found) */
    public int size() {
        return terms.length;
    }

    /**
     * @param index term index, as returned by {@link #indexOf(String)}
     * @return the offsets recorded for that term, or {@code TermVectorOffsetInfo.EMPTY_OFFSET_INFO} if none were stored
     */
    public TermVectorOffsetInfo[] getOffsets(int index) {
        return termOffsets[index];
    }

    /**
     * @param index term index, as returned by {@link #indexOf(String)}
     * @return the positions recorded for that term (internal array, not a copy)
     */
    public int[] getTermPositions(int index) {
        return termPositions[index];
    }
}