Examples of it.unimi.dsi.mg4j.index.payload.Payload

it.unimi.dsi.mg4j.index.payload.Payload
An index payload.
The main responsibility of this class is that of providing efficient ways to read and write a payload from and to bit streams. An instance of this class has at any given time a current value, which is set when {@linkplain #read(InputBitStream) reading}, and output when {@linkplain #write(OutputBitStream) writing}.
The current value can be modified using {@link #set(Object)}, and each implementation must document thoroughly which objects are accepted by this method.
It is expected that in most implementations reading and writing is much more efficient than reading, {@linkplain #get() getting a value}, {@linkplain #set(Object) setting that value} in another instance, and finally {@linkplain #write(OutputBitStream) writing}.
Implementations of a payload might have parameters. If you need to know whether two instances are compatible, in the sense that each instance can read correctly data written by the other one, you can invoke the {@link #compatibleWith(Payload)} method.
Optionally, implementations can feature a parse(String) method that returns an object of the correct type for {@link #set(Object)}. This method can be used (for instance, by reflection) to try to build a payload from a string specification (this is what happens in {@link DocumentIteratorBuilderVisitor}).

    /* This will be set if *all* indices to be merged agree. Moreover, if some
     * indices disagree we will emit a warning. */
    TermProcessor termProcessor = null;
    /* This will be set if *all* indices to be merged agree. Moreover, if some
     * indices disagree we will emit a warning. */
    Payload payload = null;
    String field = null;
    writeGlobCounts = writeSizes = true;
    boolean someGlobCounts = false, someSizes = false, allDataForSizeComputation = true;
    
    for( int i = 0; i < numIndices; i++ ) {
      index[ i ] = (BitStreamIndex)Index.getInstance( inputBasename[ i ], false, requireSizes, false );
      if ( i == 0 ) {
        termProcessor = index[ 0 ].termProcessor.copy();
        payload = index[ 0 ].payload == null ? null : index[ 0 ].payload.copy();
      }
      else {
        if ( ! termProcessor.equals( index[ i ].termProcessor ) ) throw new IllegalStateException( "The term processor of the first index (" + termProcessor + ") is different from the term processor of index " + i + " (" + index[ i ].termProcessor + ")" );
        if ( ( payload == null ) != ( index[ i ].payload == null ) || payload != null && ! payload.compatibleWith( index[ i ].payload ) ) throw new IllegalStateException( "The payload specification of index " + index[ 0 ] + " is not compatible with that of index " + index[ i ] );
      }


      if ( index[ i ].field != null ) {
        if ( field == null ) {
          if ( i != 0 ) LOGGER.warn( "Not all indices specify the field property" );
          field = index[ i ].field;
        }
        else if ( ! field.equals( index[ i ].field ) ) LOGGER.warn( "Index fields disagree: \"" + field + "\", \"" + index[ i ].field + "\"" );
      }




      haveCounts &= index[ i ].hasCounts;
      havePositions &= index[ i ].hasPositions;
      maxCount = Math.max( maxCount, index[ i ].maxCount );
      indexReader[ i ] = index[ i ].getReader( bufferSize );
      if ( index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES, -1 ) == -1 ) numberOfOccurrences = -1;
      if ( numberOfOccurrences != -1 ) numberOfOccurrences += index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES );
      final File globCountsFile = new File( this.inputBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      writeGlobCounts &= globCountsFile.exists();
      someGlobCounts |= globCountsFile.exists();
      if ( writeGlobCounts ) globCounts[ i ] = new InputBitStream( globCountsFile );


      if ( ! metadataOnly ) {
        final File offsetsFile = new File( this.inputBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
        allDataForSizeComputation &= offsetsFile.exists();
        if ( quantum < 0 && allDataForSizeComputation ) offsets[ i ] = new InputBitStream( offsetsFile );


        if ( index[ i ].hasPositions ) {
          final File positionsLengthsFile = new File( this.inputBasename[ i ] + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
          allDataForSizeComputation &= positionsLengthsFile.exists();
          if ( quantum < 0 && allDataForSizeComputation ) posNumBits[ i ] = new InputBitStream( positionsLengthsFile );
        }
      }
      
      final File sizesFile = new File( this.inputBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
      writeSizes &= sizesFile.exists();
      someSizes |= sizesFile.exists();


      term[ i ] = new MutableString();
      termReader[ i ] = new FastBufferedReader( new InputStreamReader( new FileInputStream( this.inputBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
      if ( termReader[ i ].readLine( term[ i ] ) != null ) termQueue.enqueue( i ); // If the term list is nonempty, we enqueue it
    }


    if ( writeGlobCounts != someGlobCounts ) LOGGER.warn(  "Some (but not all) global-counts file missing" );
    if ( writeSizes != someSizes ) LOGGER.warn(  "Some (but not all) sizes file missing" );
    
    additionalProperties = new Properties();
    additionalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
    if ( payload != null ) {
      additionalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
      //writerFlags.put( Component.PAYLOADS, null );
    }
    additionalProperties.setProperty( Index.PropertyKeys.BATCHES, inputBasename.length );
    if ( field != null ) additionalProperties.setProperty( Index.PropertyKeys.FIELD, field );

View Full Code Here

    final Index index = curr.top();
    if ( ! index.hasPayloads ) throw new IllegalStateException( "Index " + index + " does not have payloads" );
    try {
      final Object parser = index2Parser.containsKey( index ) ? index2Parser.get( index ) : index.payload;
      final Method method = parser.getClass().getMethod( "parse", String.class );
      final Payload left = index.payload.copy(), right = index.payload.copy();
      if ( node.left != null ) left.set( method.invoke( parser, node.left.toString() ) );
      if ( node.right != null ) right.set( method.invoke( parser, node.right.toString() ) );
      return PayloadPredicateDocumentIterator.getInstance( index.documents( 0 ), 
          index.payload.rangeFilter( node.left == null ? null : left, node.right == null ? null : right ) ).weight( weight() );
    }
    catch( InvocationTargetException e ) {

View Full Code Here

      indirect[ i ] = new InputBitStream( tempFile[ i ] );
      if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
    }
    int usedIndices;
    MutableString currentTerm = new MutableString();
    Payload payload = null;
    int frequency, globalPointer, localIndex, localPointer, count = -1;


    pl.expectedUpdates = globalIndex.numberOfPostings;
    pl.itemsName = "postings";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index..." );


    for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
      terms.readLine( currentTerm );
      indexIterator = indexReader.nextIterator();
      usedIndices = 0;
      frequency = indexIterator.frequency();
      
      for ( int j = 0; j < frequency; j++ ) {
        globalPointer = indexIterator.nextDocument();                
        localIndex = strategy.localIndex( globalPointer );  


        if ( localFrequency[ localIndex ] == 0 ) {
          // First time we see a document for this index.
          currentTerm.println( localTerms[ localIndex ] );
          numTerms[ localIndex ]++;
          usedIndex[ usedIndices++ ] = localIndex;
          if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
        }
        
        /* Store temporarily posting data; note that we save the global pointer as we
         * will have to access the size list. */
        
        localFrequency[ localIndex ]++;
        numPostings[ localIndex ]++;
        temp[ localIndex ].writeGamma( globalPointer );


        if ( globalIndex.hasPayloads ) payload = indexIterator.payload();
        if ( havePayloads ) payload.write( temp[ localIndex ] );
        
        if ( haveCounts ) {
          count = indexIterator.count();
          temp[ localIndex ].writeGamma( count );
          globCount[ localIndex ] += count;        
          if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count;         
          if ( havePositions ) {
            final int[] pos = indexIterator.positionArray();
            // TODO: compress this stuff
            for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] ); 
          }
        }
      }
      
      // We now run through the indices used by this term and copy from the temporary buffer.


      OutputBitStream obs;
      
      for( int k = 0; k < usedIndices; k++ ) {
        final int i = usedIndex[ k ];


        localFrequencies[ i ].writeGamma( localFrequency[ i ] );
        if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
        if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
        globCount[ i ] = 0;
        
        InputBitStream ibs;
        indexWriter[ i ].newInvertedList();


        temp[ i ].align();
        if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
        else {
          // We cannot read directly from the internal buffer.
          ibs = indirect[ i ];
          ibs.flush();
          temp[ i ].flush();
        }


        ibs.position( 0 );
          
        indexWriter[ i ].writeFrequency( localFrequency[ i ] );
        for( int j = 0; j < localFrequency[ i ]; j++ ) {
          obs = indexWriter[ i ].newDocumentRecord();
          globalPointer = ibs.readGamma();
          localPointer = strategy.localPointer( globalPointer );  
          indexWriter[ i ].writeDocumentPointer( obs, localPointer );
          if ( havePayloads ) {
            payload.read( ibs );
            indexWriter[ i ].writePayload( obs, payload );
          }
          if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
          if ( havePositions ) {
            for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
            indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
          }
          
        }
        temp[ i ].position( 0 );
        temp[ i ].writtenBits( 0 );
        localFrequency[ i ] = 0;
      }
      
      usedIndices = 0;
      pl.count += frequency - 1;
      pl.update();
    }


    pl.done();


    Properties globalProperties = new Properties();
    globalProperties.setProperty( Index.PropertyKeys.FIELD, inputProperties.getProperty( Index.PropertyKeys.FIELD ) );
    globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
    
    for ( int i = 0; i < numIndices; i++ ) {
      localFrequencies[ i ].close();
      if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].close();
      localTerms[ i ].close(); 
      indexWriter[ i ].close();
      if ( bloomFilterPrecision != 0 ) BinIO.storeObject( bloomFilter[ i ], localBasename[ i ] + DocumentalCluster.BLOOM_EXTENSION );
      temp[ i ].close();
      tempFile[ i ].delete();
      
      Properties localProperties = indexWriter[ i ].properties();
      localProperties.addAll( globalProperties );
      localProperties.setProperty( Index.PropertyKeys.MAXCOUNT, String.valueOf( maxDocPos[ i ] ) );
      localProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize[ i ] );
      localProperties.setProperty( Index.PropertyKeys.FIELD, globalProperties.getProperty( Index.PropertyKeys.FIELD ) );
      localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, haveCounts ? numOccurrences[ i ] : -1 );
      localProperties.setProperty( Index.PropertyKeys.POSTINGS, numPostings[ i ] );
      localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
      if ( havePayloads ) localProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
      if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
      localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
    }


    if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
    for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
    globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, bloomFilterPrecision != 0 );
    // If we partition an index with a single term, by definition we have a flat cluster
    globalProperties.setProperty( DocumentalCluster.PropertyKeys.FLAT, inputProperties.getInt( Index.PropertyKeys.TERMS ) <= 1 );
    globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, inputProperties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
    globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, inputProperties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
    globalProperties.setProperty( Index.PropertyKeys.POSTINGS, inputProperties.getProperty( Index.PropertyKeys.POSTINGS ) );
    globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, inputProperties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
    globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, inputProperties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
    globalProperties.setProperty( Index.PropertyKeys.TERMS, inputProperties.getProperty( Index.PropertyKeys.TERMS ) );
    if ( havePayloads ) globalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );


    /* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
     * strategy we can optimise a bit. */
    
    globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS,

View Full Code Here

    final String field = properties.getString( Index.PropertyKeys.FIELD, new File( basename.toString() ).getName() );


    if ( termMap != null && termMap.size() != numberOfTerms ) throw new IllegalArgumentException( "The size of the term map (" + termMap.size() + ") is not equal to the number of terms (" + numberOfTerms + ")" );
    if ( prefixMap != null && prefixMap.size() != numberOfTerms ) throw new IllegalArgumentException( "The size of the prefix map (" + prefixMap.size() + ") is not equal to the number of terms (" + numberOfTerms + ")" );


    final Payload payload = (Payload)( properties.containsKey( Index.PropertyKeys.PAYLOADCLASS ) ? Class.forName( properties.getString( Index.PropertyKeys.PAYLOADCLASS ) ).newInstance() : null );
    final Coding frequencyCoding = flags.get( Component.FREQUENCIES );
    final Coding pointerCoding = flags.get( Component.POINTERS );
    final Coding countCoding = flags.get( Component.COUNTS );
    final Coding positionCoding = flags.get( Component.POSITIONS );

View Full Code Here


    if ( sizes != null && documentSizes ) LOGGER.warn( "You are loading both local sizes and a global size file specified by the \"size\" properties, which is usually nonsensical" );


    boolean hasCounts = true;
    boolean hasPositions = true;
    Payload payload = null;
    
    for ( int i = 0; i < localIndex.length; i++ ) {
      hasCounts = hasCounts && localIndex[ i ].hasCounts;
      hasPositions = hasPositions && localIndex[ i ].hasPositions;


      if ( i == 0 ) payload = localIndex[ i ].payload;
      if ( ( payload == null ) != ( localIndex[ i ].payload == null ) || payload != null && ! payload.compatibleWith( localIndex[ i ].payload ) ) throw new IllegalStateException( "The payload specification of index " + localIndex[ 0 ] + " is not compatible with that of index " + localIndex[ i ] );
    }


    // We stem the names of Bloom filters from the index basename.
    BloomFilter[] termFilter = null;
    if ( properties.getBoolean( DocumentalCluster.PropertyKeys.BLOOM ) ) {

View Full Code Here

              // TODO: write tests for the other case
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );  
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents(  0  );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                } 
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                }
              }


              if ( allBitStreamIndices ) {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();


                  IndexIterator indexIterator = indexReader[ i ].documents( t );


                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c ) 
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );


                          for( int j = 0; j < c; j++ ) 
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )  
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    } 
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                }
              }
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );


                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c ) 
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );


                          for( int j = 0; j < c; j++ ) 
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )  
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  } 
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
                }
              }
            }
          }
          docCounter++;
          document.close();
          pl.update();
        }
      }
      else {
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );


        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
        
        final MutableString word = new MutableString(), nonWord = new MutableString();
        
        int docCounter = 0;
        
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;


          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );  
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );

View Full Code Here

TOP

Related Classes of it.unimi.dsi.mg4j.index.payload.Payload

it.unimi.dsi.mg4j.index.cluster.IndexCluster

it.unimi.dsi.mg4j.index.DiskBasedIndex

it.unimi.dsi.mg4j.search.DocumentIteratorBuilderVisitor

it.unimi.dsi.mg4j.test.Verifier

it.unimi.dsi.mg4j.tool.Combine

it.unimi.dsi.mg4j.tool.PartitionDocumentally

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.