Package it.unimi.dsi.mg4j.index.payload

Examples of it.unimi.dsi.mg4j.index.payload.Payload


    /* This will be set if *all* indices to be merged agree. Moreover, if some
     * indices disagree we will emit a warning. */
    TermProcessor termProcessor = null;
    /* This will be set if *all* indices to be merged agree. Moreover, if some
     * indices disagree we will emit a warning. */
    Payload payload = null;
    String field = null;
    writeGlobCounts = writeSizes = true;
    boolean someGlobCounts = false, someSizes = false, allDataForSizeComputation = true;
   
    for( int i = 0; i < numIndices; i++ ) {
      index[ i ] = (BitStreamIndex)Index.getInstance( inputBasename[ i ], false, requireSizes, false );
      if ( i == 0 ) {
        termProcessor = index[ 0 ].termProcessor.copy();
        payload = index[ 0 ].payload == null ? null : index[ 0 ].payload.copy();
      }
      else {
        if ( ! termProcessor.equals( index[ i ].termProcessor ) ) throw new IllegalStateException( "The term processor of the first index (" + termProcessor + ") is different from the term processor of index " + i + " (" + index[ i ].termProcessor + ")" );
        if ( ( payload == null ) != ( index[ i ].payload == null ) || payload != null && ! payload.compatibleWith( index[ i ].payload ) ) throw new IllegalStateException( "The payload specification of index " + index[ 0 ] + " is not compatible with that of index " + index[ i ] );
      }

      if ( index[ i ].field != null ) {
        if ( field == null ) {
          if ( i != 0 ) LOGGER.warn( "Not all indices specify the field property" );
          field = index[ i ].field;
        }
        else if ( ! field.equals( index[ i ].field ) ) LOGGER.warn( "Index fields disagree: \"" + field + "\", \"" + index[ i ].field + "\"" );
      }


      haveCounts &= index[ i ].hasCounts;
      havePositions &= index[ i ].hasPositions;
      maxCount = Math.max( maxCount, index[ i ].maxCount );
      indexReader[ i ] = index[ i ].getReader( bufferSize );
      if ( index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES, -1 ) == -1 ) numberOfOccurrences = -1;
      if ( numberOfOccurrences != -1 ) numberOfOccurrences += index[ i ].properties.getLong( Index.PropertyKeys.OCCURRENCES );
      final File globCountsFile = new File( this.inputBasename[ i ] + DiskBasedIndex.GLOBCOUNTS_EXTENSION );
      writeGlobCounts &= globCountsFile.exists();
      someGlobCounts |= globCountsFile.exists();
      if ( writeGlobCounts ) globCounts[ i ] = new InputBitStream( globCountsFile );

      if ( ! metadataOnly ) {
        final File offsetsFile = new File( this.inputBasename[ i ] + DiskBasedIndex.OFFSETS_EXTENSION );
        allDataForSizeComputation &= offsetsFile.exists();
        if ( quantum < 0 && allDataForSizeComputation ) offsets[ i ] = new InputBitStream( offsetsFile );

        if ( index[ i ].hasPositions ) {
          final File positionsLengthsFile = new File( this.inputBasename[ i ] + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION );
          allDataForSizeComputation &= positionsLengthsFile.exists();
          if ( quantum < 0 && allDataForSizeComputation ) posNumBits[ i ] = new InputBitStream( positionsLengthsFile );
        }
      }
     
      final File sizesFile = new File( this.inputBasename[ i ] + DiskBasedIndex.SIZES_EXTENSION );
      writeSizes &= sizesFile.exists();
      someSizes |= sizesFile.exists();

      term[ i ] = new MutableString();
      termReader[ i ] = new FastBufferedReader( new InputStreamReader( new FileInputStream( this.inputBasename[ i ] + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) );
      if ( termReader[ i ].readLine( term[ i ] ) != null ) termQueue.enqueue( i ); // If the term list is nonempty, we enqueue it
    }

    if ( writeGlobCounts != someGlobCounts ) LOGGER.warn( "Some (but not all) global-counts file missing" );
    if ( writeSizes != someSizes ) LOGGER.warn( "Some (but not all) sizes file missing" );
   
    additionalProperties = new Properties();
    additionalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( termProcessor ) );
    if ( payload != null ) {
      additionalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
      //writerFlags.put( Component.PAYLOADS, null );
    }
    additionalProperties.setProperty( Index.PropertyKeys.BATCHES, inputBasename.length );
    if ( field != null ) additionalProperties.setProperty( Index.PropertyKeys.FIELD, field );

View Full Code Here


    final Index index = curr.top();
    if ( ! index.hasPayloads ) throw new IllegalStateException( "Index " + index + " does not have payloads" );
    try {
      final Object parser = index2Parser.containsKey( index ) ? index2Parser.get( index ) : index.payload;
      final Method method = parser.getClass().getMethod( "parse", String.class );
      final Payload left = index.payload.copy(), right = index.payload.copy();
      if ( node.left != null ) left.set( method.invoke( parser, node.left.toString() ) );
      if ( node.right != null ) right.set( method.invoke( parser, node.right.toString() ) );
      return PayloadPredicateDocumentIterator.getInstance( index.documents( 0 ),
          index.payload.rangeFilter( node.left == null ? null : left, node.right == null ? null : right ) ).weight( weight() );
    }
    catch( InvocationTargetException e ) {
View Full Code Here

      indirect[ i ] = new InputBitStream( tempFile[ i ] );
      if ( bloomFilterPrecision != 0 ) bloomFilter[ i ] = new BloomFilter( globalIndex.numberOfTerms, bloomFilterPrecision );
    }
    int usedIndices;
    MutableString currentTerm = new MutableString();
    Payload payload = null;
    int frequency, globalPointer, localIndex, localPointer, count = -1;

    pl.expectedUpdates = globalIndex.numberOfPostings;
    pl.itemsName = "postings";
    pl.logInterval = logInterval;
    pl.start( "Partitioning index..." );

    for ( int t = 0; t < globalIndex.numberOfTerms; t++ ) {
      terms.readLine( currentTerm );
      indexIterator = indexReader.nextIterator();
      usedIndices = 0;
      frequency = indexIterator.frequency();
     
      for ( int j = 0; j < frequency; j++ ) {
        globalPointer = indexIterator.nextDocument();               
        localIndex = strategy.localIndex( globalPointer );

        if ( localFrequency[ localIndex ] == 0 ) {
          // First time we see a document for this index.
          currentTerm.println( localTerms[ localIndex ] );
          numTerms[ localIndex ]++;
          usedIndex[ usedIndices++ ] = localIndex;
          if ( bloomFilterPrecision != 0 ) bloomFilter[ localIndex ].add( currentTerm );
        }
       
        /* Store temporarily posting data; note that we save the global pointer as we
         * will have to access the size list. */
       
        localFrequency[ localIndex ]++;
        numPostings[ localIndex ]++;
        temp[ localIndex ].writeGamma( globalPointer );

        if ( globalIndex.hasPayloads ) payload = indexIterator.payload();
        if ( havePayloads ) payload.write( temp[ localIndex ] );
       
        if ( haveCounts ) {
          count = indexIterator.count();
          temp[ localIndex ].writeGamma( count );
          globCount[ localIndex ] += count;       
          if ( maxDocPos[ localIndex ] < count ) maxDocPos[ localIndex ] = count;        
          if ( havePositions ) {
            final int[] pos = indexIterator.positionArray();
            // TODO: compress this stuff
            for( int p = 0; p < count; p++ ) temp[ localIndex ].writeGamma( pos[ p ] );
          }
        }
      }
     
      // We now run through the indices used by this term and copy from the temporary buffer.

      OutputBitStream obs;
     
      for( int k = 0; k < usedIndices; k++ ) {
        final int i = usedIndex[ k ];

        localFrequencies[ i ].writeGamma( localFrequency[ i ] );
        if ( haveCounts ) numOccurrences[ i ] += globCount[ i ];
        if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].writeLongGamma( globCount[ i ] );
        globCount[ i ] = 0;
       
        InputBitStream ibs;
        indexWriter[ i ].newInvertedList();

        temp[ i ].align();
        if ( temp[ i ].buffer() != null ) ibs = direct[ i ];
        else {
          // We cannot read directly from the internal buffer.
          ibs = indirect[ i ];
          ibs.flush();
          temp[ i ].flush();
        }

        ibs.position( 0 );
         
        indexWriter[ i ].writeFrequency( localFrequency[ i ] );
        for( int j = 0; j < localFrequency[ i ]; j++ ) {
          obs = indexWriter[ i ].newDocumentRecord();
          globalPointer = ibs.readGamma();
          localPointer = strategy.localPointer( globalPointer );
          indexWriter[ i ].writeDocumentPointer( obs, localPointer );
          if ( havePayloads ) {
            payload.read( ibs );
            indexWriter[ i ].writePayload( obs, payload );
          }
          if ( haveCounts ) indexWriter[ i ].writePositionCount( obs, count = ibs.readGamma() );
          if ( havePositions ) {
            for( int p = 0; p < count; p++ ) position[ p ] = ibs.readGamma();
            indexWriter[ i ].writeDocumentPositions( obs, position, 0, count, sizeList != null ? sizeList.getInt( globalPointer ) : -1 );
          }
         
        }
        temp[ i ].position( 0 );
        temp[ i ].writtenBits( 0 );
        localFrequency[ i ] = 0;
      }
     
      usedIndices = 0;
      pl.count += frequency - 1;
      pl.update();
    }

    pl.done();

    Properties globalProperties = new Properties();
    globalProperties.setProperty( Index.PropertyKeys.FIELD, inputProperties.getProperty( Index.PropertyKeys.FIELD ) );
    globalProperties.setProperty( Index.PropertyKeys.TERMPROCESSOR, inputProperties.getProperty( Index.PropertyKeys.TERMPROCESSOR ) );
   
    for ( int i = 0; i < numIndices; i++ ) {
      localFrequencies[ i ].close();
      if ( localGlobCounts[ i ] != null ) localGlobCounts[ i ].close();
      localTerms[ i ].close();
      indexWriter[ i ].close();
      if ( bloomFilterPrecision != 0 ) BinIO.storeObject( bloomFilter[ i ], localBasename[ i ] + DocumentalCluster.BLOOM_EXTENSION );
      temp[ i ].close();
      tempFile[ i ].delete();
     
      Properties localProperties = indexWriter[ i ].properties();
      localProperties.addAll( globalProperties );
      localProperties.setProperty( Index.PropertyKeys.MAXCOUNT, String.valueOf( maxDocPos[ i ] ) );
      localProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, maxDocSize[ i ] );
      localProperties.setProperty( Index.PropertyKeys.FIELD, globalProperties.getProperty( Index.PropertyKeys.FIELD ) );
      localProperties.setProperty( Index.PropertyKeys.OCCURRENCES, haveCounts ? numOccurrences[ i ] : -1 );
      localProperties.setProperty( Index.PropertyKeys.POSTINGS, numPostings[ i ] );
      localProperties.setProperty( Index.PropertyKeys.TERMS, numTerms[ i ] );
      if ( havePayloads ) localProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );
      if ( strategyProperties[ i ] != null ) localProperties.addAll( strategyProperties[ i ] );
      localProperties.save( localBasename[ i ] + DiskBasedIndex.PROPERTIES_EXTENSION );
    }

    if ( strategyFilename != null ) globalProperties.setProperty( IndexCluster.PropertyKeys.STRATEGY, strategyFilename );
    for( int i = 0; i < numIndices; i++ ) globalProperties.addProperty( IndexCluster.PropertyKeys.LOCALINDEX, localBasename[ i ] );
    globalProperties.setProperty( DocumentalCluster.PropertyKeys.BLOOM, bloomFilterPrecision != 0 );
    // If we partition an index with a single term, by definition we have a flat cluster
    globalProperties.setProperty( DocumentalCluster.PropertyKeys.FLAT, inputProperties.getInt( Index.PropertyKeys.TERMS ) <= 1 );
    globalProperties.setProperty( Index.PropertyKeys.MAXCOUNT, inputProperties.getProperty( Index.PropertyKeys.MAXCOUNT ) );
    globalProperties.setProperty( Index.PropertyKeys.MAXDOCSIZE, inputProperties.getProperty( Index.PropertyKeys.MAXDOCSIZE ) );
    globalProperties.setProperty( Index.PropertyKeys.POSTINGS, inputProperties.getProperty( Index.PropertyKeys.POSTINGS ) );
    globalProperties.setProperty( Index.PropertyKeys.OCCURRENCES, inputProperties.getProperty( Index.PropertyKeys.OCCURRENCES ) );
    globalProperties.setProperty( Index.PropertyKeys.DOCUMENTS, inputProperties.getProperty( Index.PropertyKeys.DOCUMENTS ) );
    globalProperties.setProperty( Index.PropertyKeys.TERMS, inputProperties.getProperty( Index.PropertyKeys.TERMS ) );
    if ( havePayloads ) globalProperties.setProperty( Index.PropertyKeys.PAYLOADCLASS, payload.getClass().getName() );

    /* For the general case, we must rely on a merged cluster. However, if we detect a contiguous
     * strategy we can optimise a bit. */
   
    globalProperties.setProperty( Index.PropertyKeys.INDEXCLASS,
View Full Code Here

    final String field = properties.getString( Index.PropertyKeys.FIELD, new File( basename.toString() ).getName() );

    if ( termMap != null && termMap.size() != numberOfTerms ) throw new IllegalArgumentException( "The size of the term map (" + termMap.size() + ") is not equal to the number of terms (" + numberOfTerms + ")" );
    if ( prefixMap != null && prefixMap.size() != numberOfTerms ) throw new IllegalArgumentException( "The size of the prefix map (" + prefixMap.size() + ") is not equal to the number of terms (" + numberOfTerms + ")" );

    final Payload payload = (Payload)( properties.containsKey( Index.PropertyKeys.PAYLOADCLASS ) ? Class.forName( properties.getString( Index.PropertyKeys.PAYLOADCLASS ) ).newInstance() : null );
    final Coding frequencyCoding = flags.get( Component.FREQUENCIES );
    final Coding pointerCoding = flags.get( Component.POINTERS );
    final Coding countCoding = flags.get( Component.COUNTS );
    final Coding positionCoding = flags.get( Component.POSITIONS );
   
View Full Code Here

    if ( sizes != null && documentSizes ) LOGGER.warn( "You are loading both local sizes and a global size file specified by the \"size\" properties, which is usually nonsensical" );

    boolean hasCounts = true;
    boolean hasPositions = true;
    Payload payload = null;
   
    for ( int i = 0; i < localIndex.length; i++ ) {
      hasCounts = hasCounts && localIndex[ i ].hasCounts;
      hasPositions = hasPositions && localIndex[ i ].hasPositions;

      if ( i == 0 ) payload = localIndex[ i ].payload;
      if ( ( payload == null ) != ( localIndex[ i ].payload == null ) || payload != null && ! payload.compatibleWith( localIndex[ i ].payload ) ) throw new IllegalStateException( "The payload specification of index " + localIndex[ 0 ] + " is not compatible with that of index " + localIndex[ i ] );
    }

    // We stem the names of Bloom filters from the index basename.
    BloomFilter[] termFilter = null;
    if ( properties.getBoolean( DocumentalCluster.PropertyKeys.BLOOM ) ) {
View Full Code Here

              // TODO: write tests for the other case
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                }
              }

              if ( allBitStreamIndices ) {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();

                  IndexIterator indexIterator = indexReader[ i ].documents( t );

                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                }
              }
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );

                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
                }
              }
            }
          }
          docCounter++;
          document.close();
          pl.update();
        }
      }
      else {
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );

        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
       
        final MutableString word = new MutableString(), nonWord = new MutableString();
       
        int docCounter = 0;
       
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
View Full Code Here

TOP

Related Classes of it.unimi.dsi.mg4j.index.payload.Payload

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle. Contact coftware#gmail.com.