Package it.unimi.dsi.mg4j.document

Source Code of it.unimi.dsi.mg4j.document.SimpleCompressedDocumentCollection$FrequencyCodec

package it.unimi.dsi.mg4j.document;

/*    
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2009-2010 Sebastiano Vigna
*
*  This library is free software; you can redistribute it and/or modify it
*  under the terms of the GNU Lesser General Public License as published by the Free
*  Software Foundation; either version 3 of the License, or (at your option)
*  any later version.
*
*  This library is distributed in the hope that it will be useful, but
*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
*  for more details.
*
*  You should have received a copy of the GNU Lesser General Public License
*  along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/


import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.AbstractLongIterator;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMaps;
import it.unimi.dsi.io.ByteBufferInputStream;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.NullInputStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.mg4j.document.DocumentFactory.FieldType;
import it.unimi.dsi.mg4j.tool.Scan.VirtualDocumentFragment;
import it.unimi.dsi.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.nio.channels.FileChannel.MapMode;
import java.util.NoSuchElementException;
import java.util.zip.ZipFile;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.IOUtils;

import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;

/** A basic, compressed document collection that can be easily built at indexing time.
*
* <p>Instances of this class record virtual and non-text fields just like {@link ZipDocumentCollection}&mdash;that is,
* in a zip file. However, text fields are recorded in a simple but highly efficient format. Terms (and nonterms) are numbered globally
* in an increasing way as they are met. While we scan each document, we keep track of frequencies for a limited number of terms:
* terms are encoded with their frequency rank if we know their statistics, or by a special code derived from their
* global number if we have no statistics about them. Every number involved is written in delta code.
*
* <p>A collection can be <em>exact</em> or <em>approximated</em>: in the latter case, nonwords will not be recorded, and will
* be turned into spaces when decompressing.
*
* <p>A instance of this collection will be, as any other collection, serialised on a file, but it will refer to several other files
* that are derived from the instance basename. Please use {@link AbstractDocumentSequence#load(CharSequence)}
* to load instances of this collection.
*
* <p>This class suffers the same scalability problem of {@link ZipDocumentCollection} if you compress non-text or virtual fields. Text
* compression, on the other hand, is extremely efficient and scalable.
* @author Sebastiano Vigna
*/

public class SimpleCompressedDocumentCollection extends AbstractDocumentCollection implements Serializable {
  private static final long serialVersionUID = 1L;

  private static final boolean DEBUG = false;
  protected static final boolean ASSERTS = false;

  /** Standard extension for the file containing encoded documents. */
  public static final String DOCUMENTS_EXTENSION = ".documents";
  /** Standard extension for the file containing document offsets stored as &delta;-encoded gaps. */
  public static final String DOCUMENT_OFFSETS_EXTENSION = ".docoffsets";
  /** Standard extension for the file containing terms in {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)} format. */
  public static final String TERMS_EXTENSION  = ".terms";
  /** Standard extension for the file containing term offsets stored as &delta;-encoded gaps. */
  public static final String TERM_OFFSETS_EXTENSION = ".termoffsets";
  /** Standard extension for the file containing nonterms in {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)} format. */
  public static final String NONTERMS_EXTENSION  = ".nonterms";
  /** Standard extension for the file containing nonterm offsets stored as &delta;-encoded gaps. */
  public static final String NONTERM_OFFSETS_EXTENSION = ".nontermoffsets";
  /** Standard extension for the stats file. */
  public static final String STATS_EXTENSION = ".stats";

  /** The basename of this collection. */
  private final String basename;
  /** Whether this collection is exact (i.e., whether it stores nonwords). */
  private final boolean exact;
  /** The number of documents in this collection. */
  private final long documents;
  /** The number of terms in this collection. */
  private final long terms;
  /** The number of nonterms in this collection, or -1 if {@link #exact} is false. */
  private final long nonTerms;
  /** The document offsets. */
  private transient EliasFanoMonotoneLongBigList docOffsets;
  /** The term offsets. */
  private transient EliasFanoMonotoneLongBigList termOffsets;
  /** The nonterm offsets, or <code>null</code> if {@link #exact} is false. */
  private transient EliasFanoMonotoneLongBigList nonTermOffsets;
  /** The input bit stream for documents. */
  private transient InputBitStream documentsInputBitStream;
  /** The input bit stream for terms. */
  private transient FastBufferedInputStream termsInputStream;
  /** The input bit stream for nonterms, or <code>null</code> if {@link #exact} is false. */
  private transient FastBufferedInputStream nonTermsInputStream;
  /** A frequency keeper used to decompress document terms. */
  private transient FrequencyCodec termsFrequencyKeeper;
  /** A frequency keeper used to decompress document nonterms, or <code>null</code> if {@link #exact} is false. */
  private transient FrequencyCodec nonTermsFrequencyKeeper;
  /** The underlying factory. */
  private final DocumentFactory factory;
  /** Whether this collection contains non-text or virtual fields. */
  private final boolean hasNonText;
  /** The zip file used to store non-text and virtual fields if {@link #hasNonText} is true, or  <code>null</code> if this collection does not store such fields. */
  private transient ZipFile zipFile;
  /** The input stream obtained by memory-mapping the file containing documents, or <code>null</code>. */
  private transient ByteBufferInputStream documentsByteBufferInputStream;
  /** The input stream obtained by memory-mapping the file containing terms, or <code>null</code>. */
  private transient ByteBufferInputStream termsByteBufferInputStream;
  /** The input stream obtained by memory-mapping the file containing nonterms, or <code>null</code>. */
  private transient ByteBufferInputStream nonTermsByteBufferInputStream;
  /** True if ancillary files have been all correctly opened. */
  private boolean fileOpenOk;
  /** True if memory mappings have been all been obtained. */
  private boolean fileMappingOk;

  /** An iterator used to load &delta;-encoded offset gaps. */
  private static final class OffsetsLongIterator extends AbstractLongIterator {
    private final long numberOfItems;
    private long currIndex;
    private long currValue;
    private final InputBitStream ibs;
   
    public OffsetsLongIterator( InputBitStream ibs, long numberOfItems ) {
      this.ibs = ibs;
      this.numberOfItems = numberOfItems;
    }
   
    public boolean hasNext() {
      return currIndex < numberOfItems;
    }
   
    @Override
    public long nextLong() {
      if ( ! hasNext() ) throw new NoSuchElementException();
      try {
        currIndex++;
        return currValue += ibs.readDelta();
      }
      catch ( IOException e ) {
        throw new RuntimeException( e );
      }
    }
  }

  /** A simple codec for integers that remaps frequent numbers to smaller numbers. */
  protected static class FrequencyCodec {
    /** The size of the symbol queue. */
    private final static int MAX_QUEUE_SIZE = 2048;
    /** The symbol queue. */
    private final int[] queue;
    /** An array parallel to  {@link  #queue} containing frequencies. */
    private final int[] freq;
    /** A map from input symbols to positions in {@link #queue}. */
    private final Int2IntOpenHashMap code2Pos;
    /** The current size of {{@link #queue}. */
    private int queueSize;
   
    public FrequencyCodec() {
      code2Pos = new Int2IntOpenHashMap();
      code2Pos.defaultReturnValue( -1 );
      queue = new int[ MAX_QUEUE_SIZE ];
      freq = new int[ MAX_QUEUE_SIZE ];
    }
   
    /** Empties the queue and the symbol-to-position map. */
    public void reset() {
      queueSize = 0;
      code2Pos.clear();
    }
   
    private final void newSymbol( final int symbol ) {
      if ( queueSize == MAX_QUEUE_SIZE ) {
        // Queue filled up. First, we guarantee that there are elements with frequency one.
        if ( freq[ MAX_QUEUE_SIZE -1 ] != 1 ) for( int j = MAX_QUEUE_SIZE; j-- != 0; ) freq[ j ] /= freq[ MAX_QUEUE_SIZE - 1 ];
        // Then, we remove half of them.
        int j = MAX_QUEUE_SIZE;
        while( j-- != 0 ) if ( freq[ j ] > 1 ) break;
        for( int k = j + ( MAX_QUEUE_SIZE - j ) / 2; k < MAX_QUEUE_SIZE; k++ ) {
          if ( ASSERTS ) assert freq[ k ] == 1;
          code2Pos.remove( queue[ k ] );
        }
        queueSize = j + ( MAX_QUEUE_SIZE - j ) / 2;
      }
     
      // Now we know that we have space.
      if ( ASSERTS ) assert queueSize < MAX_QUEUE_SIZE;
      code2Pos.put( symbol, queueSize );
      queue[ queueSize ] = symbol;
      freq[ queueSize ] = 1;
      queueSize++;
    }
   
    private final void oldSymbol( final int pos ) {
      // Term already in list
      // Find term to exchange for change of frequency
      int ex = pos;
      while( ex >= 0 && freq[ ex ] == freq[ pos ] ) ex--;
      ++ex;
      freq[ pos ]++;
      // Exchange
      int t = queue[ pos ];
      queue[ pos ] = queue[ ex ];
      queue[ ex ] = t;
      t = freq[ pos ];
      freq[ pos ] = freq[ ex ];
      freq[ ex ] = t;
      code2Pos.put( queue[ ex ], ex );
      code2Pos.put( queue[ pos ], pos );
    }
   
    /** Encodes a symbol, returning a (hopefully smaller) symbol.
     *
     * @param symbol the input symbol.
     * @return the output symbol.
     */
    public int encode( final int symbol ) {
      final int pos = code2Pos.get( symbol );
      if ( pos == -1 ) {
        final int result = queueSize + symbol;
        newSymbol( symbol );
        return result;
      }
      else {
        if ( DEBUG ) System.err.println( "Symbol " + symbol + " in list; writing " + pos  + " " + code2Pos + " "  + IntArrayList.wrap( queue, queueSize ) + " " + IntArrayList.wrap( freq, queueSize ) );
        oldSymbol( pos );
        return pos;
      }
    }

    /** Decodes a symbol, returning the original symbol.
     *
     * @param symbol a symbol an encoded file.
     * @return the corresponding original input symbol.
     */
    public int decode( final int symbol ) {
     
      if ( symbol < queueSize ) {
        final int result = queue[ symbol ];
        oldSymbol( symbol );
        return result;
      }
      else {
        int term = symbol - queueSize;
        newSymbol( term );
        return term;
      }
    }
  }
 
  private SimpleCompressedDocumentCollection( String basename, DocumentFactory factory,
      EliasFanoMonotoneLongBigList docOffsets, EliasFanoMonotoneLongBigList termOffsets, EliasFanoMonotoneLongBigList nonTermOffsets, ByteBufferInputStream documentsByteBufferInputStream,
      ByteBufferInputStream termsByteBufferInputStream, ByteBufferInputStream nonTermsByteBufferInputStream ) {
    this.basename = basename;
    this.documents = docOffsets.length() - 1;
    this.terms = termOffsets.length() - 1;
    this.exact = nonTermOffsets != null;
    this.nonTerms = exact ? termOffsets.length() - 1 : -1;
    this.docOffsets = docOffsets;
    this.termOffsets = termOffsets;
    this.nonTermOffsets = nonTermOffsets;
    this.factory = factory;
    this.termsFrequencyKeeper = new FrequencyCodec();
    this.nonTermsFrequencyKeeper = exact ? new FrequencyCodec() : null;
    this.documentsByteBufferInputStream = documentsByteBufferInputStream;
    this.termsByteBufferInputStream = termsByteBufferInputStream;
    this.nonTermsByteBufferInputStream = nonTermsByteBufferInputStream;
    this.hasNonText = hasNonText( factory );

  }

  protected SimpleCompressedDocumentCollection( final String basename, final long documents, final long terms, final long nonTerms, final boolean exact, final DocumentFactory factory ) {
    this.hasNonText = hasNonText( factory );
    this.basename = basename;
    this.documents = documents;
    this.terms = terms;
    this.nonTerms = nonTerms;
    this.exact = exact;
    this.factory = factory;
    this.termsFrequencyKeeper = null;
    this.nonTermsFrequencyKeeper = null;
    docOffsets = termOffsets = nonTermOffsets = null;
    documentsInputBitStream = null;
    termsInputStream = nonTermsInputStream = null;
    zipFile = null;
    try {
      super.close();
    }
    catch ( IOException cantHappen ) {
      throw new RuntimeException( cantHappen );
    }
  }

  private static boolean hasNonText( final DocumentFactory factory ) {
    boolean hasNonText = false;
    for( int i = factory.numberOfFields(); i-- != 0; ) hasNonText |= factory.fieldType( i ) != FieldType.TEXT;
    return hasNonText;
  }

  private void initMappings( final String basename, final boolean rethrow ) throws IOException {
    try {
      // TODO: This is too risky: we will have to make it optional at some point
      // documentsByteBufferInputStream = ByteBufferInputStream.map( new FileInputStream( basename + DOCUMENTS_EXTENSION ).getChannel(), MapMode.READ_ONLY );
      termsByteBufferInputStream = ByteBufferInputStream.map( new FileInputStream( basename + TERMS_EXTENSION ).getChannel(), MapMode.READ_ONLY );
      nonTermsByteBufferInputStream = nonTermOffsets != null ? ByteBufferInputStream.map( new FileInputStream( basename + NONTERMS_EXTENSION ).getChannel(), MapMode.READ_ONLY ) : null;
      fileMappingOk = true;
    }
    catch( IOException e ) {
      // We leave the possibility for a filename() to fix the problem and map the files.
      if ( rethrow ) throw e;
    }
  }
 
  private void initFiles( final String basename, final boolean rethrow ) throws IOException {
    try {
      docOffsets = loadOffsetsSuccinctly( basename + DOCUMENT_OFFSETS_EXTENSION, documents, new File( basename + DOCUMENTS_EXTENSION ).length() * Byte.SIZE + 1 );
      termOffsets = loadOffsetsSuccinctly( basename + TERM_OFFSETS_EXTENSION, terms, new File( basename + TERMS_EXTENSION ).length() + 1 );
      nonTermOffsets = nonTerms < 0 ? null : loadOffsetsSuccinctly( basename + NONTERM_OFFSETS_EXTENSION, nonTerms, new File( basename + NONTERMS_EXTENSION ).length() + 1 );

      documentsInputBitStream = documentsByteBufferInputStream != null ? new InputBitStream( documentsByteBufferInputStream ) : new InputBitStream( basename + DOCUMENTS_EXTENSION );
      termsInputStream = new FastBufferedInputStream( termsByteBufferInputStream != null ? termsByteBufferInputStream : new FileInputStream( basename + TERMS_EXTENSION ) );
      nonTermsInputStream = exact ? new FastBufferedInputStream( nonTermsByteBufferInputStream != null ? nonTermsByteBufferInputStream : new FileInputStream( basename + NONTERMS_EXTENSION ) ) : null;
      zipFile = hasNonText ? new ZipFilebasename + ZipDocumentCollection.ZIP_EXTENSION ) : null;
      fileOpenOk = true;
    }
    catch( IOException e ) {
      // We leave the possibility for a filename() to fix the problem and load the right files.
      if ( rethrow ) throw e;
    }
  }
 
  private void ensureFiles() {
    if ( ! fileOpenOk ) throw new IllegalStateException( "Some of the files used by this " + SimpleCompressedDocumentCollection.class.getSimpleName() + " have not been loaded correctly; please use " + AbstractDocumentSequence.class.getName() + ".load() or call filename() after deserialising this instance" );
  }
 
  private static EliasFanoMonotoneLongBigList loadOffsetsSuccinctly( final CharSequence filename, final long numberOfItems, final long upperBound ) throws IOException {
    final InputBitStream ibs = new InputBitStream( filename.toString() );
    final EliasFanoMonotoneLongBigList offsets = new EliasFanoMonotoneLongBigList( numberOfItems + 1, upperBound, new OffsetsLongIterator( ibs, numberOfItems + 1 ) );
    ibs.close();
    return offsets;
  }
   
  @Override
  public void filename( CharSequence filename ) throws IOException {
    if ( ! fileMappingOk ) initMappings( new File( new File( filename.toString() ).getParentFile(), basename ).toString(), true );
    if ( ! fileOpenOk ) initFiles( new File( new File( filename.toString() ).getParentFile(), basename ).toString(), true );
  }
 
  public DocumentCollection copy() {
    ensureFiles();
    try {
      SimpleCompressedDocumentCollection copy = new SimpleCompressedDocumentCollection(
          basename,
          factory.copy(),
          docOffsets,
          termOffsets,
          nonTermOffsets,
          documentsByteBufferInputStream != null ? documentsByteBufferInputStream.copy() : null,
          termsByteBufferInputStream != null ? termsByteBufferInputStream.copy() : null,
          nonTermsByteBufferInputStream != null ? nonTermsByteBufferInputStream.copy() : null );
      copy.initFiles( basename, true );
      return copy;
    }
    catch ( IOException e ) {
      throw new RuntimeException( e );
    }
  }

  private static MutableString readSelfDelimitedUtf8String( final InputBitStream ibs, final MutableString s ) throws IOException {
    s.length( 0 );
    for( int length = ibs.readDelta(); length-- != 0; ) s.append( (char)ibs.readZeta( 7 ) );
    return s;
  }
 
 
  public Document document( int index ) throws IOException {
    ensureDocumentIndex( index );
    ensureFiles();
    documentsInputBitStream.position( docOffsets.getLong( index ) );
    final DataInputStream nonTextDataInputStream = hasNonText ? new DataInputStream( new FastBufferedInputStream( zipFile.getInputStream( zipFile.getEntry( Integer.toString( index ) ) ) ) ) : null;
    final MutableString uri = readSelfDelimitedUtf8String( documentsInputBitStream, new MutableString() );
    final MutableString title = readSelfDelimitedUtf8String( documentsInputBitStream, new MutableString() );

    return new AbstractDocument() {
      final MutableString fieldContent = new MutableString();
     
      @SuppressWarnings("unchecked")
      final Document fakeDocument = factory.getDocument( NullInputStream.getInstance(), Reference2ObjectMaps.EMPTY_MAP );
     
      int nextField = 0;

      public Object content( int field ) throws IOException {
        FieldType fieldType = factory.fieldType( field );

        if ( nextField > field ) throw new IllegalStateException();
        // Skip fields
        final MutableString s = new MutableString();
        int len;
        while( nextField < field ) {
          switch( fieldType ) {
          case TEXT:
            len = documentsInputBitStream.readDelta();
            if ( exact ) len *= 2;
            documentsInputBitStream.skipDeltas( len );
            break;
          case VIRTUAL:
            final int nfrag = nonTextDataInputStream.readInt();
            for ( int i = 0; i < 2 * nfrag; i++ ) MutableString.skipSelfDelimUTF8( nonTextDataInputStream );
            break;
          default:
            try { new ObjectInputStream( nonTextDataInputStream ).readObject(); } catch ( ClassNotFoundException e ) { throw new RuntimeException( e ); }
          }
          nextField++;
        }
       
        // Read field
        nextField++;

        switch( fieldType ) {
        case TEXT:
          len = documentsInputBitStream.readDelta();
          fieldContent.length( 0 );

          termsFrequencyKeeper.reset();
          if ( exact ) nonTermsFrequencyKeeper.reset();

          while( len-- != 0 ) {
            termsInputStream.position( termOffsets.getLong( termsFrequencyKeeper.decode( documentsInputBitStream.readDelta() ) ) );
            s.readSelfDelimUTF8( termsInputStream );
            fieldContent.append( s );
            if ( exact ) {
              nonTermsInputStream.position( nonTermOffsets.getLong( nonTermsFrequencyKeeper.decode( documentsInputBitStream.readDelta() ) ) );
              s.readSelfDelimUTF8( nonTermsInputStream );
              fieldContent.append( s );
            }
            else fieldContent.append( ' ');
          }
          return new FastBufferedReader( fieldContent );
        case VIRTUAL:
          final int nfrag = nonTextDataInputStream.readInt();
          MutableString doc = new MutableString();
          MutableString text = new MutableString();
          VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[ nfrag ];
          for ( int i = 0; i < nfrag; i++ ) {
            doc.readSelfDelimUTF8( (InputStream)nonTextDataInputStream );
            text.readSelfDelimUTF8( (InputStream)nonTextDataInputStream );
            fragArray[ i ] = new AnchorExtractor.Anchor( doc.copy(), text.copy() );
          }
          return new ObjectArrayList<VirtualDocumentFragment>( fragArray );

        default:
          try { return new ObjectInputStream( nonTextDataInputStream ).readObject(); } catch ( ClassNotFoundException e ) { throw new RuntimeException( e ); }
        }
     
       
      }

      public CharSequence title() {
        return title;
      }

      public CharSequence uri() {
        return uri.length() == 0 ? null : uri;
      }

      public WordReader wordReader( int field ) {
        switch( factory.fieldType( field ) ) {
        case TEXT:
        case VIRTUAL: return fakeDocument.wordReader( field );
        default: return null;
        }
      }
     
      public void close() throws IOException {
        super.close();
        if ( hasNonText ) nonTextDataInputStream.close();
      }

    };
  }

 
  public Reference2ObjectMap<Enum<?>,Object> metadata( int index ) throws IOException {
    throw new UnsupportedOperationException();
  }

 
  public int size() {
    return (int)documents;
  }

 
  public InputStream stream( int index ) throws IOException {
    throw new UnsupportedOperationException();
  }
 
  public void close() throws IOException {
    super.close();
    if ( documentsInputBitStream != null ) documentsInputBitStream.close();
    IOUtils.closeQuietly( termsInputStream );
    IOUtils.closeQuietly( nonTermsInputStream );
  }

  public DocumentFactory factory() {
    return factory;
  }

  private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
    s.defaultReadObject();
    initMappings( basename, false );
    initFiles( basename, false );
    termsFrequencyKeeper = new FrequencyCodec();
    if ( exact ) nonTermsFrequencyKeeper = new FrequencyCodec();
  }
   
 
  // Unfinished, experimental method
  public static void optimize( final CharSequence basename ) throws IOException, ClassNotFoundException {
    final SimpleCompressedDocumentCollection collection = (SimpleCompressedDocumentCollection)AbstractDocumentCollection.load( basename );
    final long[] termFrequency = new long[ (int)collection.terms ];
    final long[] nonTermFrequency = collection.exact ? new long[ (int)collection.nonTerms ] : null;
    final InputBitStream documentsIbs = collection.documentsInputBitStream;
    final DocumentFactory factory = collection.factory;
    final boolean exact = collection.exact;
    final MutableString s = new MutableString();
    documentsIbs.position( 0 );
    for( int i = (int)collection.documents; i-- != 0; ) {
      readSelfDelimitedUtf8String( documentsIbs, s ); // Skip URI
      readSelfDelimitedUtf8String( documentsIbs, s ); // Skip title
      for( int f = factory.numberOfFields() - 1; f-- !=0; ) {
        int len = documentsIbs.readDelta();
        while( len-- != 0 ) {
          termFrequency[ documentsIbs.readDelta() ]++;
          if ( exact ) nonTermFrequency[ documentsIbs.readDelta() ]++;
        }
      }
    }
   
    int[] termPerm = new int[ termFrequency.length ];
    for( int i = termPerm.length; i-- != 0; ) termPerm[ i ] = i;
    IntArrays.quickSort( termPerm, 0, termPerm.length, new AbstractIntComparator() {
      public int compare( int arg0, int arg1 ) {
        return termFrequency[ arg1 ] - termFrequency[ arg0 ] < 0 ? -1 : termFrequency[ arg1 ] == termFrequency[ arg0 ] ? 0 : 1;
      }
    });
   
    int[] invTermPerm = new int[ termFrequency.length ];
    for( int i = invTermPerm.length; i-- != 0; ) invTermPerm[ termPerm[ i ] ] = i;
   
    int[] nonTermPerm = null, invNonTermPerm = null;
    if ( exact ) {
      nonTermPerm = new int[ termFrequency.length ];
      for( int i = nonTermPerm.length; i-- != 0; ) nonTermPerm[ i ] = i;
      IntArrays.quickSort( nonTermPerm, 0, nonTermPerm.length, new AbstractIntComparator() {
        public int compare( int arg0, int arg1 ) {
          return termFrequency[ arg1 ] - termFrequency[ arg0 ] < 0 ? -1 : termFrequency[ arg1 ] == termFrequency[ arg0 ] ? 0 : 1;
        }
      });
      invNonTermPerm = new int[ nonTermFrequency.length ];
      for( int i = invNonTermPerm.length; i-- != 0; ) invNonTermPerm[ nonTermPerm[ i ] ] = i;
    }

    File newDocumentsFile = File.createTempFile( SimpleCompressedDocumentCollection.class.getSimpleName(), "temp", new File( basename.toString() ).getParentFile() );
    OutputBitStream newDocumentsObs = new OutputBitStream( newDocumentsFile );
    documentsIbs.position( 0 );
    for( int i = (int)collection.documents; i-- != 0; ) {
      readSelfDelimitedUtf8String( documentsIbs, s ); // Skip URI
      SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String( newDocumentsObs, s );
      readSelfDelimitedUtf8String( documentsIbs, s ); // Skip title
      SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String( newDocumentsObs, s );
      for( int f = factory.numberOfFields() - 1; f-- !=0; ) {
        int len = documentsIbs.readDelta();
        newDocumentsObs.writeDelta( len );
        while( len-- != 0 ) {
          newDocumentsObs.writeDelta( invTermPerm[ documentsIbs.readDelta() ] );
          if ( exact ) newDocumentsObs.writeDelta( invNonTermPerm[ documentsIbs.readDelta() ] );
        }
      }
    }
    newDocumentsObs.close();
    new File( basename + DOCUMENTS_EXTENSION ).delete();
    newDocumentsFile.renameTo( new File( basename + DOCUMENTS_EXTENSION ) );
    newDocumentsObs = null;
    invTermPerm = invNonTermPerm = null;
   
    FastBufferedInputStream termsStream = new FastBufferedInputStream( new FileInputStream( basename + TERMS_EXTENSION ) ) ;
    MutableString term[] = new MutableString[ (int)collection.terms ];
    for( int i = 0; i < term.length; i++ ) term[ i ] = new MutableString().readSelfDelimUTF8( termsStream );
    termsStream.close();

    new FastBufferedOutputStream( new FileOutputStream( basename + TERMS_EXTENSION ) );
  }

  public static void main( final String[] arg ) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, ConfigurationException, ClassNotFoundException {

    SimpleJSAP jsap = new SimpleJSAP( FileSetDocumentCollection.class.getName(), "Optimises a simple compressed document collection.",
        new Parameter[] {
          new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename of the collection." ),
        }
    );
   
    JSAPResult jsapResult = jsap.parse( arg );
    if ( jsap.messagePrinted() ) return;
   
    optimize( jsapResult.getString( "basename" ) );
  }

}
TOP

Related Classes of it.unimi.dsi.mg4j.document.SimpleCompressedDocumentCollection$FrequencyCodec

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.