Package it.unimi.dsi.io

Examples of it.unimi.dsi.io.WordReader


      for ( int f = 0; f < seq.factory().numberOfFields(); f++ ) {
        System.out.println( "** Field # " + f + ", " + seq.factory().fieldName( f ) );
        Object field = document.content( f );
        if ( seq.factory().fieldType( f ) == FieldType.TEXT ) {
          Reader reader = (Reader)field;
          WordReader wr = document.wordReader( f );
          wr.setReader( reader );
          MutableString word = new MutableString();
          MutableString nonWord = new MutableString();
            while ( wr.next( word, nonWord ) ) System.out.println( word.toString() + nonWord.toString() );
        } else System.out.println( field );
      }
      doc++;
    }
  }
View Full Code Here


            throw new RuntimeException( e );
          }
          //logger.debug( "Asked for a new word reader for field " + fieldName( field ) );
          switch ( fieldType( field ) ) {
          case TEXT:
            return new WordReader() {
              private static final long serialVersionUID = 1L;
              public boolean next( final MutableString word, final MutableString nonWord ) throws IOException {
                try {
                  word.readSelfDelimUTF8( rawContent );
                }
View Full Code Here

    pl.displayFreeMemory = true;
    pl.start( "Indexing documents..." );

    DocumentIterator iterator = documentSequence.iterator();
    Reader reader;
    WordReader wordReader;
    ObjectList<VirtualDocumentFragment> fragments;
    Document document;

    int documentPointer = 0, documentsInBatch = 0;
    long batchStartTime = System.currentTimeMillis();
    boolean outOfMemoryError = false;

    while ( ( document = iterator.nextDocument() ) != null ) {
     
      long overallTerms = 0;
      if ( building ) builder.startDocument( document.title(), document.uri() );
      for ( int i = 0; i < numberOfIndexedFields; i++ ) {
        switch ( factory.fieldType( indexedField[ i ] ) ) {
        case TEXT:
          reader = (Reader)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          wordReader.setReader( reader );
          if ( building ) builder.startTextField();
          scan[ i ].processDocument( map != null ? map[ documentPointer ] : documentPointer, wordReader );
          if ( building ) builder.endTextField();
          overallTerms += scan[ i ].numTerms;
          break;
        case VIRTUAL:
          fragments = (ObjectList<VirtualDocumentFragment>)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          virtualDocumentResolver[ i ].context( document );
          for( VirtualDocumentFragment fragment: fragments ) {
            int virtualDocumentPointer = virtualDocumentResolver[ i ].resolve( fragment.documentSpecifier() );
            if ( virtualDocumentPointer < 0 ) continue;
            if ( map != null ) virtualDocumentPointer = map[ virtualDocumentPointer ];
            wordReader.setReader( new FastBufferedReader( fragment.text() ) );
            scan[ i ].processDocument( virtualDocumentPointer, wordReader );
          }
          if ( building ) builder.virtualField( fragments );
          overallTerms += scan[ i ].numTerms;
          break;
View Full Code Here

  @SuppressWarnings("unchecked")
  public void build( final DocumentSequence inputSequence ) throws IOException {
    final DocumentIterator docIt = inputSequence.iterator();
    if ( factory != inputSequence.factory() ) throw new IllegalStateException( "The factory provided by the constructor does not correspond to the factory of the input sequence" );
    final int numberOfFields = factory.numberOfFields();
    WordReader wordReader;
    MutableString word = new MutableString();
    MutableString nonWord = new MutableString();
   
    open( "" );
    for (;;) {
      Document document = docIt.nextDocument();
      if ( document == null ) break;
      startDocument( document.title(), document.uri() );
     
      for ( int field = 0; field < numberOfFields; field++ ) {
        Object content = document.content( field );
        if ( factory.fieldType( field ) == FieldType.TEXT ) {
          startTextField();
          wordReader = document.wordReader( field );
          wordReader.setReader( (Reader)content );
          while ( wordReader.next( word, nonWord ) ) add( word, nonWord );
          endTextField();
        }
        else if ( factory.fieldType( field ) == FieldType.VIRTUAL ) virtualField( (ObjectList<VirtualDocumentFragment>)content );
        else nonTextField( content );
      }
View Full Code Here

      if ( allBitStreamIndices ) {
        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
       
        final MutableString word = new MutableString(), nonWord = new MutableString();
       
        int docCounter = 0;
       
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              // TODO: write tests for the other case
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                }
              }

              if ( allBitStreamIndices ) {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();

                  IndexIterator indexIterator = indexReader[ i ].documents( t );

                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                }
              }
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );

                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c )
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );

                          for( int j = 0; j < c; j++ )
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
                }
              }
            }
          }
          docCounter++;
          document.close();
          pl.update();
        }
      }
      else {
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );

        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
       
        final MutableString word = new MutableString(), nonWord = new MutableString();
       
        int docCounter = 0;
       
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;

          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              pos = 0;
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                IndexIterator indexIterator = indexReader[ i ].documents( word );
                if ( currDoc != indexIterator.skipTo( currDoc ) )
                  LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + word );
                else if ( index[ i ].hasPositions ) {
View Full Code Here

    numberOfDocuments = 0;

    final DocumentIterator docIt = inputSequence.iterator();
    if ( factory != inputSequence.factory() ) throw new IllegalStateException( "The factory provided by the constructor does not correspond to the factory of the input sequence" );
    final int numberOfFields = factory.numberOfFields();
    WordReader wordReader;
    MutableString word = new MutableString();
    MutableString nonWord = new MutableString();
    open( "" );
    for (;;) {
      Document document = docIt.nextDocument();
      if ( document == null ) break;
      startDocument( document.title(), document.uri() );
     
      for ( int field = 0; field < numberOfFields; field++ ) {
        Object content = document.content( field );
        if ( factory.fieldType( field ) == FieldType.TEXT ) {
          startTextField();
          wordReader = document.wordReader( field );
          wordReader.setReader( (Reader)content );
          while ( wordReader.next( word, nonWord ) ) add( word, nonWord );
          endTextField();
        }
        else if ( factory.fieldType( field ) == FieldType.VIRTUAL ) virtualField( (ObjectList<VirtualDocumentFragment>)content );
        else nonTextField( content );
      }
View Full Code Here

    for ( int doc = 0; doc < coll.size(); doc++ ) {
      Document docum = coll.document( doc );
      for ( int i = 0; i < nfields; i++ ) {
        int field = fieldNumber[ i ];
        Reader content = (Reader)docum.content( field );
        WordReader wordReader = docum.wordReader( field );
        wordReader.setReader( content );
        StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] );
        System.err.println( "Checking document " + doc + " field " + fieldName[ i ] + " (" + field + ")" );
        checkSameWords( wordReader, tok );
      }
      docum.close();
View Full Code Here

    int doc = 0;
    while ( ( docum = iterator.nextDocument() ) != null ) {
      for ( int i = 0; i < nfields; i++ ) {
        int field = fieldNumber[ i ];
        Reader content = (Reader)docum.content( field );
        WordReader wordReader = docum.wordReader( field );
        wordReader.setReader( content );
        StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] );
        System.err.println( "Checking sequentially document " + doc + " field " + fieldName[ i ] + " (" + field + ")" );
        checkSameWords( wordReader, tok );
      }
      docum.close();
View Full Code Here

        for (int i = 0; i < min(k, results.size()); i++) {
          Document document = collection
              .document(results.get(i).document);
          Reader reader = (Reader) document.content(fieldIndex);
          WordReader wordReader = document.wordReader(fieldIndex);
          wordReader.setReader(reader);

          MutableString word = new MutableString();
          MutableString nonWord = new MutableString();
          final LongRBTreeSet set = new LongRBTreeSet();

          while (wordReader.next(word, nonWord)) {
            if (processor.processTerm(word)) {
              long termId = index.getTermId(word);
              if (termId >= 0)
                if (set.add(termId))
                  relFreq
View Full Code Here

        long unknown = index.getUnknownTermId();

        Multiset<Long> words = HashMultiset.create();

        for (int contentId : contents) {
            final WordReader reader = doc.wordReader(0);

            // Loop over terms
            while (reader.next(token, separator)) {
                final Long termId = index.getTermId(token);
                if (termId == unknown) continue;
                words.add(termId);
            }
        } // loop over content
View Full Code Here

TOP

Related Classes of it.unimi.dsi.io.WordReader

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.