/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package ivory.core.data.document;
import ivory.core.RetrievalEnvironment;
import ivory.core.preprocess.BuildIntDocVectorsForwardIndex;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.log4j.Logger;
import com.google.common.base.Preconditions;
import edu.umd.cloud9.debug.MemoryUsageUtils;
/**
 * Object providing an index into one or more {@code SequenceFile}s
 * containing {@link IntDocVector}s, providing random access to the document
 * vectors.
 *
 * <p>The forward index file is read once, at construction time, into an
 * in-memory array holding one encoded position per document. Each lookup then
 * opens the relevant part file, seeks to the recorded position and reads a
 * single record.
 *
 * @see BuildIntDocVectorsForwardIndex
 *
 * @author Jimmy Lin
 */
public class IntDocVectorsForwardIndex {
  private static final Logger LOG = Logger.getLogger(IntDocVectorsForwardIndex.class);

  // Formats a part-file number as a zero-padded, five-digit suffix (e.g. "00003").
  // NOTE: java.text.DecimalFormat is documented as not synchronized; callers that share
  // an instance of this class across threads must synchronize lookups externally.
  private static final NumberFormat FORMAT = new DecimalFormat("00000");

  private final FileSystem fs;
  private final Configuration conf;

  // One entry per document; each entry packs the part-file number and the position inside
  // that file as (fileNo * BigNumber + position).
  private final long[] positions;

  // Directory holding the part files with the document vectors.
  private final String path;

  // First int of the forward index file; positions[0] belongs to docno (docnoOffset + 1).
  private final int docnoOffset;

  // Second int of the forward index file; number of entries in positions.
  private final int collectionDocumentCount;

  /**
   * Creates an {@code IntDocVectorsForwardIndex} object over the unweighted document vectors.
   *
   * @param indexPath location of the index file
   * @param fs handle to the FileSystem
   * @throws IOException if the forward index cannot be read
   */
  public IntDocVectorsForwardIndex(String indexPath, FileSystem fs) throws IOException {
    this(indexPath, fs, false);
  }

  /**
   * Creates an {@code IntDocVectorsForwardIndex} object.
   *
   * @param indexPath location of the index file
   * @param fs handle to the FileSystem
   * @param weighted {@code true} to load weighted document vectors
   * @throws IOException if the forward index cannot be read
   * @throws NullPointerException if {@code fs} is {@code null}
   */
  public IntDocVectorsForwardIndex(String indexPath, FileSystem fs, boolean weighted)
      throws IOException {
    this.fs = Preconditions.checkNotNull(fs);
    this.conf = fs.getConf();

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    path = weighted ? env.getWeightedIntDocVectorsDirectory() : env.getIntDocVectorsDirectory();
    String forwardIndexPath = weighted ? env.getWeightedIntDocVectorsForwardIndex()
        : env.getIntDocVectorsForwardIndex();

    int offset;
    int count;
    long[] loaded;

    FSDataInputStream posInput = fs.open(new Path(forwardIndexPath));
    try {
      offset = posInput.readInt();
      count = posInput.readInt();
      if (count < 0) {
        // Fail with a descriptive I/O error instead of a NegativeArraySizeException.
        throw new IOException("Corrupt forward index " + forwardIndexPath
            + ": negative document count " + count);
      }

      loaded = new long[count];
      for (int i = 0; i < count; i++) {
        loaded[i] = posInput.readLong();
      }
    } finally {
      // Always release the stream, including when the file is truncated or corrupt.
      posInput.close();
    }

    docnoOffset = offset;
    collectionDocumentCount = count;
    positions = loaded;
  }

  /**
   * Returns the document vector given a docno.
   *
   * <p>Valid docnos are {@code docnoOffset + 1} through
   * {@code docnoOffset + collectionDocumentCount}, inclusive.
   *
   * @param docno document number to look up
   * @return {@code IntDocVector} for the appropriate docno, or {@code null} if the record
   *         found at the indexed position does not belong to {@code docno}
   * @throws IllegalArgumentException if {@code docno} is outside the range covered by this index
   * @throws IOException if the underlying {@code SequenceFile} cannot be opened or read
   */
  public IntDocVector getDocVector(int docno) throws IOException {
    // Computed in long arithmetic so that extreme docno/offset values cannot overflow.
    long index = (long) docno - docnoOffset - 1;
    Preconditions.checkArgument(index >= 0 && index < positions.length,
        "docno %s is outside the indexed range (%s, %s]",
        docno, docnoOffset, (long) docnoOffset + positions.length);

    long encoded = positions[(int) index];
    int fileNo = (int) (encoded / BuildIntDocVectorsForwardIndex.BigNumber);
    long filePos = encoded % BuildIntDocVectorsForwardIndex.BigNumber;

    SequenceFile.Reader reader = openPartFile(fileNo);
    try {
      IntWritable key = new IntWritable();
      IntDocVector value;
      try {
        value = (IntDocVector) reader.getValueClass().newInstance();
      } catch (Exception e) {
        // Keep the cause so that the failing value class can be diagnosed.
        throw new RuntimeException("Unable to instantiate key/value pair!", e);
      }

      reader.seek(filePos);
      boolean found = reader.next(key, value);
      if (!found || key.get() != docno) {
        LOG.error("unable to find doc vector for docno " + docno + ": found docno " + key
            + " instead");
        return null;
      }
      return value;
    } finally {
      // Close on every path: success, docno mismatch and exceptions.
      reader.close();
    }
  }

  /**
   * Opens the part file with the given number, trying the {@code part-m-NNNNN} name first and
   * falling back to {@code part-NNNNN}.
   */
  private SequenceFile.Reader openPartFile(int fileNo) throws IOException {
    String suffix = FORMAT.format(fileNo);
    try {
      return new SequenceFile.Reader(fs, new Path(path + "/part-m-" + suffix), conf);
    } catch (IOException e) {
      // Try alternative naming scheme for the old API.
      return new SequenceFile.Reader(fs, new Path(path + "/part-" + suffix), conf);
    }
  }

  /**
   * Simple test program: loads the index at the given path, reports the memory used by the
   * load, then reads docnos from standard input and prints the corresponding document vectors.
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 1) {
      System.out.println("usage: [indexPath]");
      System.exit(-1);
    }

    long startingMemoryUse = MemoryUsageUtils.getUsedMemory();
    Configuration conf = new Configuration();
    IntDocVectorsForwardIndex index = new IntDocVectorsForwardIndex(args[0], FileSystem.get(conf));
    long endingMemoryUse = MemoryUsageUtils.getUsedMemory();

    System.out.println("Memory usage: " + (endingMemoryUse - startingMemoryUse) + " bytes\n");

    String term = null;
    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
    System.out.print("Look up postings of doc > ");
    while ((term = stdin.readLine()) != null) {
      try {
        int docno = Integer.parseInt(term.trim());
        System.out.println(docno + ": " + index.getDocVector(docno));
      } catch (NumberFormatException e) {
        // Keep the interactive session alive on malformed input.
        System.out.println("Not a valid docno: " + term);
      } catch (IllegalArgumentException e) {
        // Raised by getDocVector for docnos outside the indexed range.
        System.out.println("Docno out of range: " + term);
      }
      System.out.print("Look up postings of doc > ");
    }
  }
}