Package it.unimi.dsi.io

Examples of it.unimi.dsi.io.InputBitStream


      if ( index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES, -1 ) == -1 ) numberOfOccurrences = -1;
      if ( numberOfOccurrences != -1 ) numberOfOccurrences += index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES );
      final File globCountsFile = new File( this.inputBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      writeGlobCounts &= globCountsFile.exists();
      someGlobCounts |= globCountsFile.exists();
      if ( writeGlobCounts ) globCounts[ i ] = new InputBitStream( globCountsFile );

      if ( ! metadataOnly ) {
        final File offsetsFile = new File( this.inputBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
        allDataForSizeComputation &= offsetsFile.exists();
        if ( quantum < 0 && allDataForSizeComputation ) offsets[ i ] = new InputBitStream( offsetsFile );

        if ( index[ i ].hasPositions ) {
          final File positionsLengthsFile = new File( this.inputBasename[ i ] + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
          allDataForSizeComputation &= positionsLengthsFile.exists();
          if ( quantum < 0 && allDataForSizeComputation ) posNumBits[ i ] = new InputBitStream( positionsLengthsFile );
        }
      }
     
      final File sizesFile = new File( this.inputBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
      writeSizes &= sizesFile.exists();
View Full Code Here


   */
 
  protected IntIterator sizes( int numIndex ) throws FileNotFoundException {
    if ( index[ numIndex ].sizes != null ) return index[ numIndex ].sizes.listIterator();
    LOGGER.debug( "Reading sizes from " + inputBasename[ numIndex ] + DiskBasedIndex.SIZES_EXTENSION );
    return new GammaCodedIntIterator( new InputBitStream( inputBasename[ numIndex ] + DiskBasedIndex.SIZES_EXTENSION ) );
  }
View Full Code Here

    return new FileInputStream( indexFile );
  }

  @Override
  public InputBitStream getInputBitStream( final int bufferSize ) throws FileNotFoundException {
    return new InputBitStream( indexFile, bufferSize );
  }
View Full Code Here

    return new FileInputStream( positionsFile );
  }

  @Override
  public InputBitStream getPositionsInputBitStream( int bufferSize ) throws IOException {
    return new InputBitStream( positionsFile, bufferSize );
  }
View Full Code Here

    super( outputBasename, inputBasename, metadataOnly, incremental, bufferSize, writerFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval );
    this.incremental = incremental;

    tempFile = File.createTempFile( "MG4J", ".data", tempFileDir );
    cacheBitStreamOut = new CachingOutputBitStream( tempFile, tempBufferSize );
    cacheBitStreamIn = new InputBitStream( tempFile, bufferSize );
    cacheBitStreamInWrapper = new InputBitStream( cacheBitStreamOut.buffer() );
    /* In this case, we must reallocate position as by merging occurrences we might
     * obtain an occurrence list as large as the concatenation of all largest
     * lists. We use this estimate to allocate position, and update maxCount in
     * combine() to get the real maxCount. */
    int estimateForMaxCount = 0, tempSize = 0;
View Full Code Here

      if ( p != 0 ) variableQuantumIndexWriter.newInvertedList( totalFrequency, p, predictedSize, predictedLengthNumBits );
      else indexWriter.newInvertedList();

      indexWriter.writeFrequency( totalFrequency );
      cacheBitStreamOut.align();
      final InputBitStream ibs;

      if ( cacheBitStreamOut.buffer() != null ) ibs = cacheBitStreamInWrapper;
      else {
        cacheBitStreamOut.flush();
        ibs = cacheBitStreamIn;
        ibs.flush();
      }

      ibs.position( 0 );

      currDoc = -1;
      for( int j = totalFrequency; j-- != 0; ) {
        obs = indexWriter.newDocumentRecord();
        indexWriter.writeDocumentPointer( obs, currDoc = ibs.readDelta() + currDoc + 1 );
        if ( hasCounts ) {
          count = ibs.readGamma();
          indexWriter.writePositionCount( obs, count );
          if ( hasPositions ) {
            position[ 0 ] = ibs.readDelta();
            for( int k = 1; k < count; k++ ) position[ k ] = position[ k - 1 ] + ibs.readDelta() + 1;
            indexWriter.writeDocumentPositions( obs, position, 0, count, size != null ? size[ currDoc ] : -1 );
          }
        }
      }
View Full Code Here

  @SuppressWarnings("unchecked")
  public BM25FScorer( String... arg ) throws NumberFormatException, FileNotFoundException, IOException, ClassNotFoundException {
    this(
        Double.parseDouble( arg[ 0 ] ), // k1
        arg[ 1 ].length() == 0 ? null : (StringMap<? extends CharSequence>)BinIO.loadObject( arg[ 1 ] ), // termMap
        arg[ 2 ].length() == 0 ? null : new SemiExternalGammaList( new InputBitStream( arg[ 2 ] ) ), // frequencies
        parseBArray( arg )
    );
  }
View Full Code Here

    this.index = index;
  }

  @Override
  public InputBitStream getInputBitStream( int bufferSizeUnused ) {
    return new InputBitStream( getInputStream() );
  }
View Full Code Here

      LOGGER.debug( "Generating index " + batchBasename + "; documents: " + documentCount );

      try {
        accumulator.flush();
        final InputBitStream ibs = new InputBitStream( accumulatorStream.array );
        final IndexWriter indexWriter = new BitStreamIndexWriter( batchBasename, indexingType == IndexingType.STANDARD ? documentCount : maxDocInBatch + 1, true, flags );
        indexWriter.newInvertedList();
        indexWriter.writeFrequency( documentCount );
        OutputBitStream obs;

        if ( indexingType == IndexingType.STANDARD ) {
          for ( int i = 0; i < documentCount; i++ ) {
            obs = indexWriter.newDocumentRecord();
            indexWriter.writeDocumentPointer( obs, i );
            payload.read( ibs );
            indexWriter.writePayload( obs, payload );
          }
        }
        else {
          // We sort position by pointed document pointer.
          LongArrays.quickSort( position, 0, documentCount, new AbstractLongComparator() {
            public int compare( final long position0, final long position1 ) {
              try {
                ibs.position( position0 );
                final int d0 = ibs.readDelta();
                ibs.position( position1 );
                return d0 - ibs.readDelta();
              }
              catch ( IOException e ) {
                throw new RuntimeException( e );
              }
            }
          } );
          for ( int i = 0; i < documentCount; i++ ) {
            obs = indexWriter.newDocumentRecord();
            ibs.position( position[ i ] );
            indexWriter.writeDocumentPointer( obs, ibs.readDelta() );
            payload.read( ibs );
            indexWriter.writePayload( obs, payload );
          }

          maxDocInBatch = 0;
View Full Code Here

    final long numberOfOccurrences[] = new long[ numIndices ];
    final long numberOfPostings[] = new long[ numIndices ];
   
    final boolean isHighPerformance = BitStreamHPIndex.class.isAssignableFrom( Class.forName( new Properties( inputBasename + DiskBasedIndex.PROPERTIES_EXTENSION ).getString( Index.PropertyKeys.INDEXCLASS ) ) );
   
    final InputBitStream globalIndex = new InputBitStream( inputBasename + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
    final long globalPositionsLength = new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length();
    final InputBitStream globalPositions = isHighPerformance ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize ) : null;
    final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( new FileInputStream( inputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
    final InputBitStream offsets = new InputBitStream( inputBasename + DiskBasedIndex.OFFSETS_EXTENSION );
   
    final File posNumBitsFile = new File( inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
    final InputBitStream posNumBits = posNumBitsFile.exists() ? new InputBitStream( inputBasename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION ) : null;
    final InputBitStream frequencies = new InputBitStream( inputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
    final InputBitStream globCounts = new InputBitStream( inputBasename + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
    offsets.readGamma();
   
    for( int i = 0; i < numIndices; i++ ) {
      localIndexStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.INDEX_EXTENSION, bufferSize );
      if ( isHighPerformance ) localPositionsStream[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_EXTENSION, bufferSize );
      localFrequencies[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.FREQUENCIES_EXTENSION );
      localGlobCounts[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      localTerms[ i ] = new PrintWriter( new OutputStreamWriter( new FastBufferedOutputStream( new FileOutputStream( localBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ) ), "UTF-8" ) );
      localOffsets[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
      if ( posNumBits != null ) localPosNumBits[ i ] = new OutputBitStream( localBasename[ i ] + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
      localOffsets[ i ].writeGamma( 0 );
    }

    // The current term
    final MutableString currTerm = new MutableString();
   
    pl.expectedUpdates = ( new File( inputBasename + DiskBasedIndex.INDEX_EXTENSION ).length() + ( isHighPerformance ? new File( inputBasename + DiskBasedIndex.POSITIONS_EXTENSION ).length() : 0 ) ) * 8;
    pl.itemsName = "bits";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index..." );

    int termNumber = 0, k, prevK = -1, previousHeaderLength = 0, newHeaderLength = 0;
    long length, count, positionsOffset = 0;
    int res, frequency;
   
    while( terms.readLine( currTerm ) != null ) {
      k = strategy.localIndex( termNumber ); // The local index for this term
      if ( numTerms[ k ] != strategy.localNumber( termNumber ) ) throw new IllegalStateException();
      numTerms[ k ]++;
     
      if ( isHighPerformance ) {
        final long temp = globalIndex.readBits();
        positionsOffset = globalIndex.readLongDelta();
        previousHeaderLength = (int)( globalIndex.readBits() - temp );
        if ( prevK != -1 ) {
          length = positionsOffset - globalPositions.readBits();
          pl.count += length;
          while( length > 0 ) {
            res = (int)Math.min( bufferSize * 8, length );
            globalPositions.read( buffer, res );
            localPositionsStream[ prevK ].write( buffer, res );
            length -= res;
          }
        }
        newHeaderLength = localIndexStream[ k ].writeLongDelta( localPositionsStream[ k ].writtenBits() );
      }
     
     
      frequency = frequencies.readGamma();
      localFrequencies[ k ].writeGamma( frequency );
      numberOfPostings[ k ] += frequency;

      if ( posNumBits != null ) localPosNumBits[ k ].writeGamma( posNumBits.readGamma() );
     
      count = globCounts.readLongGamma();
      numberOfOccurrences[ k ] += count;
      localGlobCounts[ k ].writeLongGamma( count );
     
      currTerm.println( localTerms[ k ] );
     
      length = offsets.readLongGamma() - previousHeaderLength;
      localOffsets[ k ].writeLongGamma( length + newHeaderLength );
      pl.count += length + previousHeaderLength - 1;
     
      while( length > 0 ) {
        res = (int)Math.min( bufferSize * 8, length );
        globalIndex.read( buffer, res );
        localIndexStream[ k ].write( buffer, res );
        length -= res;
      }
     
      pl.update();
      prevK = k;
      termNumber++;
    }

    // We pour the last piece of positions
    if ( isHighPerformance ) {
      if ( prevK != -1 ) {
        length = globalPositionsLength * 8 - globalPositions.readBits();
        System.err.println( globalPositionsLength * 8 - globalPositions.readBits() );
        while( length > 0 ) {
          res = (int)Math.min( bufferSize * 8, length );
          globalPositions.read( buffer, res );
          localPositionsStream[ prevK ].write( buffer, res );
          length -= res;
        }
      }
    }

    pl.done();

    terms.close();
    offsets.close();
    frequencies.close();
    globCounts.close();
    globalIndex.close();
    if ( posNumBits != null ) posNumBits.close();
    if ( isHighPerformance ) globalPositions.close();
   
    // We copy the relevant properties from the original
View Full Code Here

TOP

Related Classes of it.unimi.dsi.io.InputBitStream

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.