package it.unimi.dsi.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2009-2010 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;
import it.unimi.dsi.mg4j.document.SimpleCompressedDocumentCollection.FrequencyCodec;
import it.unimi.dsi.mg4j.tool.Scan;
import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.io.Reader;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.CountingOutputStream;
/** A builder for {@linkplain SimpleCompressedDocumentCollection simple compressed document collections}.
*
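* <p>A minimal usage sketch (the variable <code>sequence</code> stands for any {@link DocumentSequence};
* the basename <code>"collection"</code> is just a placeholder):
*
* <pre>
* DocumentSequence sequence = ...;
* DocumentCollectionBuilder builder = new SimpleCompressedDocumentCollectionBuilder( "collection", sequence.factory(), true );
* builder.build( sequence ); // scans the whole sequence and serialises the resulting collection
* </pre>
*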
* @author Sebastiano Vigna
*/
public class SimpleCompressedDocumentCollectionBuilder implements DocumentCollectionBuilder {
/** The factory of the base document sequence. */
private final DocumentFactory factory;
/** Whether we are building an exact collection (i.e., one that stores nonwords). */
private final boolean exact;
/** A frequency keeper used to compress document terms. */
private final FrequencyCodec termsFrequencyKeeper;
/** A frequency keeper used to compress document nonterms, or <code>null</code> if {@link #exact} is false. */
private final FrequencyCodec nonTermsFrequencyKeeper;
/** The basename of the builder. */
private String basename;
/** The basename of the current collection. */
private String basenameSuffix;
/** The output bit stream for documents. */
private OutputBitStream documentsOutputBitStream;
/** The output stream for terms. */
private CountingOutputStream termsOutputStream;
/** The output stream for nonterms, or <code>null</code> if {@link #exact} is false. */
private CountingOutputStream nonTermsOutputStream;
/** The output bit stream for document offsets. */
private OutputBitStream documentOffsetsObs;
/** The output bit stream for term offsets. */
private OutputBitStream termOffsetsObs;
/** The output bit stream for nonterm offsets, or <code>null</code> if {@link #exact} is false. */
private OutputBitStream nonTermOffsetsObs;
/** A temporary cache for the content of a field as a list of global term numbers. If the collection is exact, it alternates terms and nonterms. */
private IntArrayList fieldContent;
/** The map from terms to global term numbers, in order of appearance. */
private Object2IntOpenHashMap<MutableString> terms;
/** The map from nonterms to global nonterm numbers, in order of appearance, or <code>null</code> if {@link #exact} is false. */
private Object2IntOpenHashMap<MutableString> nonTerms;
/** The number of documents indexed so far. */
private int documents;
/** The number of words indexed so far. */
private long words;
/** The number of fields indexed so far. */
private long fields;
/** The number of bits used to code words. */
private long bitsForWords;
/** The number of bits used to code nonwords. */
private long bitsForNonWords;
/** The number of bits used to code field lengths (the number of word/nonword pairs). */
private long bitsForFieldLengths;
/** The number of bits used to code URIs. */
private long bitsForUris;
/** The number of bits used to code document titles. */
private long bitsForTitles;
/** Whether the factory contains non-text or virtual fields, which are stored in a zip file. */
private boolean hasNonText;
/** The zip output stream used to store non-text and virtual fields if {@link #hasNonText} is true, or <code>null</code> otherwise. */
private ZipOutputStream nonTextZipOutputStream;
/** {@link #nonTextZipOutputStream} wrapped in a {@link DataOutputStream}. */
private DataOutputStream nonTextZipDataOutputStream;
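/** Creates a new simple compressed document collection builder.
*
* @param basename the basename of the builder (used as a prefix of the basename of each generated collection).
* @param factory the factory of the base document sequence.
* @param exact whether the resulting collections should be exact (i.e., store nonwords).
*/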
public SimpleCompressedDocumentCollectionBuilder( final String basename, final DocumentFactory factory, final boolean exact ) {
this.basename = basename;
this.factory = factory;
this.exact = exact;
this.termsFrequencyKeeper = new SimpleCompressedDocumentCollection.FrequencyCodec();
this.nonTermsFrequencyKeeper = exact ? new SimpleCompressedDocumentCollection.FrequencyCodec() : null;
boolean hasNonText = false;
for( int i = factory.numberOfFields(); i-- != 0; ) hasNonText |= factory.fieldType( i ) != FieldType.TEXT;
this.hasNonText = hasNonText;
terms = new Object2IntOpenHashMap<MutableString>( Scan.INITIAL_TERM_MAP_SIZE );
terms.defaultReturnValue( -1 );
if ( exact ) {
nonTerms = new Object2IntOpenHashMap<MutableString>( Scan.INITIAL_TERM_MAP_SIZE );
nonTerms.defaultReturnValue( -1 );
}
else nonTerms = null;
}
public String basename() {
return basename;
}
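/** Opens a new collection, creating all output streams and resetting all maps and counters.
*
* @param suffix a suffix that is appended to the builder basename to obtain the basename of the collection.
*/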
public void open( final CharSequence suffix ) throws IOException {
basenameSuffix = basename + suffix;
documentsOutputBitStream = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.DOCUMENTS_EXTENSION );
termsOutputStream = new CountingOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + SimpleCompressedDocumentCollection.TERMS_EXTENSION ) ) );
nonTermsOutputStream = exact ? new CountingOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + SimpleCompressedDocumentCollection.NONTERMS_EXTENSION ) ) ) : null;
documentOffsetsObs = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.DOCUMENT_OFFSETS_EXTENSION );
termOffsetsObs = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.TERM_OFFSETS_EXTENSION );
nonTermOffsetsObs = exact? new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.NONTERM_OFFSETS_EXTENSION ) : null;
fieldContent = new IntArrayList();
if ( hasNonText ) nonTextZipDataOutputStream = new DataOutputStream( nonTextZipOutputStream = new ZipOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + ZipDocumentCollection.ZIP_EXTENSION ) ) ) );
terms.clear();
terms.trim( Scan.INITIAL_TERM_MAP_SIZE );
if ( exact ) {
nonTerms.clear();
nonTerms.trim( Scan.INITIAL_TERM_MAP_SIZE );
}
words = fields = bitsForWords = bitsForNonWords = bitsForFieldLengths = bitsForUris = bitsForTitles = documents = 0;
// First offset
documentOffsetsObs.writeDelta( 0 );
termOffsetsObs.writeDelta( 0 );
if ( exact ) nonTermOffsetsObs.writeDelta( 0 );
}
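/** Adds a word/nonword pair to the current text field, assigning global term (and, for exact collections,
* nonterm) numbers and recording the corresponding strings and offsets the first time they appear.
*
* @param word the word to be added.
* @param nonWord the nonword following the word (used only if the collection is exact).
*/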
public void add( MutableString word, MutableString nonWord ) throws IOException {
int t = terms.getInt( word );
if ( t == -1 ) {
terms.put( word.copy(), t = terms.size() );
termsOutputStream.resetByteCount();
word.writeSelfDelimUTF8( termsOutputStream );
termOffsetsObs.writeLongDelta( termsOutputStream.getByteCount() );
}
fieldContent.add( t );
if ( exact ) {
t = nonTerms.getInt( nonWord );
if ( t == -1 ) {
nonTerms.put( nonWord.copy(), t = nonTerms.size() );
nonTermsOutputStream.resetByteCount();
nonWord.writeSelfDelimUTF8( nonTermsOutputStream );
nonTermOffsetsObs.writeLongDelta( nonTermsOutputStream.getByteCount() );
}
fieldContent.add( t );
}
}
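/** Closes all output streams, serialises the resulting {@link SimpleCompressedDocumentCollection}
* and writes a file of human-readable compression statistics.
*/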
public void close() throws IOException {
documentsOutputBitStream.close();
termsOutputStream.close();
IOUtils.closeQuietly( nonTermsOutputStream );
documentOffsetsObs.close();
termOffsetsObs.close();
if ( nonTermOffsetsObs != null ) nonTermOffsetsObs.close();
if ( hasNonText ) {
if ( documents == 0 ) nonTextZipOutputStream.putNextEntry( new ZipEntry( "dummy" ) );
nonTextZipDataOutputStream.close();
}
final SimpleCompressedDocumentCollection simpleCompressedDocumentCollection = new SimpleCompressedDocumentCollection( basenameSuffix, documents, terms.size(), nonTerms != null ? nonTerms.size() : -1, exact, factory );
BinIO.storeObject( simpleCompressedDocumentCollection, basenameSuffix + DocumentCollection.DEFAULT_EXTENSION );
simpleCompressedDocumentCollection.close();
final PrintStream stats = new PrintStream( new FileOutputStream ( basenameSuffix + SimpleCompressedDocumentCollection.STATS_EXTENSION ) );
final long overallBits = bitsForTitles + bitsForUris + bitsForFieldLengths + bitsForWords + bitsForNonWords;
stats.println( "Documents: " + Util.format( documents ) + " (" + Util.format( overallBits ) + ", " + Util.format( overallBits / (double)documents ) + " bits per document)" );
stats.println( "Terms: " + Util.format( terms.size() ) + " (" + Util.format( words ) + " words, " + Util.format( bitsForWords ) + " bits, " + Util.format( bitsForWords / (double)words ) + " bits per word)" );
if ( exact ) stats.println( "Nonterms: " + Util.format( nonTerms.size() ) + " (" + Util.format( words ) + " nonwords, " + Util.format( bitsForNonWords ) + " bits, " + Util.format( bitsForNonWords / (double)words ) + " bits per nonword)" );
stats.println( "Bits for field lengths: " + Util.format( bitsForFieldLengths ) + " (" + Util.format( bitsForFieldLengths / (double)fields ) + " bits per field)" );
stats.println( "Bits for URIs: " + Util.format( bitsForUris ) + " (" + Util.format( bitsForUris / (double)documents ) + " bits per URI)" );
stats.println( "Bits for titles: " + Util.format( bitsForTitles ) + " (" + Util.format( bitsForTitles / (double)documents ) + " bits per title)" );
stats.close();
}
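/** Ends the current document, recording its length in bits in the document-offsets stream and closing
* the current zip entry, if necessary.
*/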
public void endDocument() throws IOException {
documentOffsetsObs.writeLongDelta( documentsOutputBitStream.writtenBits() );
if ( hasNonText ) nonTextZipOutputStream.closeEntry();
}
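/** Ends the current text field, writing its length (in word/nonword pairs) followed by the
* frequency-coded term (and, for exact collections, nonterm) numbers as &delta; codes.
*/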
public void endTextField() throws IOException {
final int size = fieldContent.size();
words += size / ( exact ? 2 : 1 );
bitsForFieldLengths += documentsOutputBitStream.writeDelta( size / ( exact ? 2 : 1 ) );
termsFrequencyKeeper.reset();
if ( exact ) {
nonTermsFrequencyKeeper.reset();
for( int i = 0; i < size; i += 2 ) {
bitsForWords += documentsOutputBitStream.writeDelta( termsFrequencyKeeper.encode( fieldContent.getInt( i ) ) );
bitsForNonWords += documentsOutputBitStream.writeDelta( nonTermsFrequencyKeeper.encode( fieldContent.getInt( i + 1 ) ) );
}
}
else for( int i = 0; i < size; i++ ) bitsForWords += documentsOutputBitStream.writeDelta( termsFrequencyKeeper.encode( fieldContent.getInt( i ) ) );
}
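/** Stores a non-text field by serialising its content into the current zip entry.
*
* @param o the content of the non-text field.
*/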
public void nonTextField( Object o ) throws IOException {
final ObjectOutputStream oos = new ObjectOutputStream( nonTextZipDataOutputStream );
oos.writeObject( o );
oos.flush();
}
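/** Writes a string to a bit stream as its length, coded in &delta;, followed by its characters, coded in &zeta;<sub>7</sub>.
*
* @param obs the output bit stream.
* @param s the string to be written.
* @return the number of bits written.
*/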
public static int writeSelfDelimitedUtf8String( final OutputBitStream obs, final CharSequence s ) throws IOException {
final int len = s.length();
int bits = 0;
bits += obs.writeDelta( len );
for( int i = 0; i < len; i++ ) bits += obs.writeZeta( s.charAt( i ), 7 );
return bits;
}
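/** Starts a new document, resetting the written-bits count of the documents stream, writing the URI
* (or the empty string, if the URI is <code>null</code>) and the title, and opening a new zip entry
* if non-text or virtual fields are present.
*
* @param title the document title.
* @param uri the document URI, or <code>null</code>.
*/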
public void startDocument( CharSequence title, CharSequence uri ) throws IOException {
documentsOutputBitStream.writtenBits( 0 );
bitsForUris += writeSelfDelimitedUtf8String( documentsOutputBitStream, uri == null ? "" : uri );
bitsForTitles += writeSelfDelimitedUtf8String( documentsOutputBitStream, title );
if ( hasNonText ) {
final ZipEntry currEntry = new ZipEntry( Integer.toString( documents ) );
nonTextZipOutputStream.putNextEntry( currEntry );
}
documents++;
}
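/** Starts a new text field, clearing the field-content cache. */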
public void startTextField() {
fieldContent.size( 0 );
fields++;
}
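/** Stores a virtual field by writing the number of fragments, followed by the document specifier and
* text of each fragment, into the current zip entry.
*
* @param fragments the fragments of the virtual field.
*/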
public void virtualField( final ObjectList<VirtualDocumentFragment> fragments ) throws IOException {
nonTextZipDataOutputStream.writeInt( fragments.size() );
for ( VirtualDocumentFragment fragment: fragments ) {
fragment.documentSpecifier().writeSelfDelimUTF8( nonTextZipOutputStream );
fragment.text().writeSelfDelimUTF8( nonTextZipOutputStream );
}
}
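/** Builds a collection containing all documents of a given sequence; the resulting collection, whose
* basename is the basename of this builder, is serialised as a side effect.
*
* @param inputSequence a document sequence using the factory provided at construction time.
*/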
@SuppressWarnings("unchecked")
public void build( final DocumentSequence inputSequence ) throws IOException {
final DocumentIterator docIt = inputSequence.iterator();
if ( factory != inputSequence.factory() ) throw new IllegalStateException( "The factory provided by the constructor does not correspond to the factory of the input sequence" );
final int numberOfFields = factory.numberOfFields();
WordReader wordReader;
MutableString word = new MutableString();
MutableString nonWord = new MutableString();
open( "" );
for (;;) {
Document document = docIt.nextDocument();
if ( document == null ) break;
startDocument( document.title(), document.uri() );
for ( int field = 0; field < numberOfFields; field++ ) {
Object content = document.content( field );
if ( factory.fieldType( field ) == FieldType.TEXT ) {
startTextField();
wordReader = document.wordReader( field );
wordReader.setReader( (Reader)content );
while ( wordReader.next( word, nonWord ) ) add( word, nonWord );
endTextField();
}
else if ( factory.fieldType( field ) == FieldType.VIRTUAL ) virtualField( (ObjectList<VirtualDocumentFragment>)content );
else nonTextField( content );
}
document.close();
endDocument();
}
docIt.close();
close();
}
}