Examples of it.unimi.dsi.mg4j.document.Document

it.unimi.dsi.mg4j.document.Document
An indexable document.
Instances of this class represent a single document. Documents provide access to possibly several fields, which represent units of information that should be indexed separately.
Each field is accessible by a call to {@link #content(int)}. Note, however, that unless specified otherwise field content must be accessed in increasing order. You can skip some fields, but the contract of this class does not require that you can access fields in random order (although implementations may provide this feature). Moreover, the data provided by a call to {@link #content(int)} (e.g., a {@link java.io.Reader} for {@link it.unimi.dsi.mg4j.document.DocumentFactory.FieldType#TEXT TEXT} fields) may become invalid at the next call (similarly to the behaviour of {@link it.unimi.dsi.mg4j.document.DocumentCollection#document(int)}). The same holds for {@link #wordReader(int)}.
After obtaining a document, it is your responsibility to {@linkplain java.io.Closeable#close() close} it.
It is advisable, although not strictly required, that documents have a toString() equal to their title.

        final int document = dsi.document;
        
        output.print( "Document #" + document );
        output.printf( " [%.6f]", dsi.score );
        
        Document d = null; // Filled lazily
        
        // We try to print a title, preferring the supplied title list if present
        if ( titleList != null ) output.println( " " + titleList.get( document ) );
        else if ( documentCollection != null ) {
          d = documentCollection.document( document );
          output.println( " " + d.title().toString().trim() );
          d.close();
        }
        else output.println();
        
        if ( ( displayMode == OutputType.LONG || displayMode == OutputType.SNIPPET ) && dsi.info != null && queryEngine.intervalSelector != null ) {
          final Index[] sortedIndex = dsi.info.keySet().toArray( new Index[ 0 ] );
          // When a collection is available, order the indices by the position of their
          // field in the collection's factory, so that fields are visited in increasing
          // field-index order in the loop below.
          if ( documentCollection != null ) Arrays.sort( sortedIndex, new Comparator<Index>() {
            // Compares two indices by the factory field index of their respective fields.
            public int compare( final Index i0, final Index i1 ) {
              return documentCollection.factory().fieldIndex( i0.field ) - documentCollection.factory().fieldIndex( i1.field );
            }} );
          for( Index index: sortedIndex ) 
            if ( index.hasPositions ) {
              SelectedInterval[] interval = dsi.info.get( index );
              if ( interval == SelectedInterval.TRUE_ARRAY ) output.println( index.field + ": TRUE" );
              else if ( interval == SelectedInterval.FALSE_ARRAY ) output.println( index.field + ": FALSE" );
              else if ( displayMode == OutputType.LONG || documentCollection == null ) output.println( index.field + ": " + Arrays.toString( interval ) );
              else { // SNIPPET_MODE
                final MarkingMutableString s = new MarkingMutableString( marker );
                s.startField( interval );
                // TODO: this must be in increasing field order
                if ( d == null ) d = documentCollection.document( document );
                int fieldIndex = documentCollection.factory().fieldIndex( index.field );
                if ( fieldIndex == -1 || documentCollection.factory().fieldType( fieldIndex ) != DocumentFactory.FieldType.TEXT ) continue;
                final Reader reader = (Reader)d.content( fieldIndex );
                s.appendAndMark( d.wordReader( fieldIndex ).setReader( reader ) );
                s.endField();
                d.close();
                output.println( index.field + ": " + s.toString() );
              }
            }
            else if ( index.hasPayloads && dsi.info.get( index ) == SelectedInterval.TRUE_ARRAY ) {
              if ( d == null ) d = documentCollection.document( document );
              int fieldIndex = documentCollection.factory().fieldIndex( index.field );
              if ( fieldIndex == -1 ) continue;
              output.println( d.content( fieldIndex ) );
            }
          output.println();
        }
      }
    }

View Full Code Here

            LOGGER.debug( "Intervals for item " + i );
            final ResultItem resultItem = new ResultItem( dsi.document, dsi.score );
            resultItems.add( resultItem );


            if ( collection != null ) {
              final Document document = collection.document( dsi.document );
              // If both collection and title list are present, we override the collection title (cfr. Query)
              resultItem.title = StringEscapeUtils.escapeHtml( titleList != null ? titleList.get( resultItem.doc ).toString() : document.title().toString() );
              if ( useUri ) {
                if ( document.uri() != null ) resultItem.uri = StringEscapeUtils.escapeHtml( document.uri().toString() );
              }
              else {
                if ( document.uri() != null ) {
                  String stringUri = document.uri().toString();
                  // TODO: this is a quick patch to get the file server running with relative files
                  final String documentUri = URLEncoder.encode( derelativise
                  ? new File( stringUri.startsWith( "file:" ) ? stringUri.substring( 5 ) : stringUri ).getAbsoluteFile().toURI().toASCIIString()
                      : document.uri().toString(), "UTF-8" );
                  resultItem.uri = StringEscapeUtils.escapeHtml( "./Item?doc=" + resultItem.doc + "&m=" + urlEncodedMimeType + "&uri=" + documentUri );
                }
                else resultItem.uri = StringEscapeUtils.escapeHtml( "./Item?doc=" + resultItem.doc + "&m=" + urlEncodedMimeType );
              }
              
              MarkingMutableString snippet = new MarkingMutableString( TextMarker.HTML_STRONG, MarkingMutableString.HTML_ESCAPE ); 
              
              for( int j = 0; j < sortedIndex.length; j++ ) {
                if ( ! sortedIndex[ j ].hasPositions || dsi.info == null ) continue;
                selectedInterval = dsi.info.get( sortedIndex[ j ] );
                if ( selectedInterval != null ) {
                  final int field = documentCollection.factory().fieldIndex( sortedIndex[ j ].field );
                  // If the field is not present (e.g., because of parallel indexing) or it is not text we skip
                  if ( field == -1 || documentCollection.factory().fieldType( field ) != DocumentFactory.FieldType.TEXT ) continue;
                  LOGGER.debug( "Found intervals for " + sortedIndex[ j ].field + " (" + field + ")" );
                  final Reader content = (Reader)document.content( field );
                  snippet.startField( selectedInterval ).appendAndMark( document.wordReader( field ).setReader( content ) ).endField();
                }
                if ( LOGGER.isDebugEnabled() ) LOGGER.debug( sortedIndex[ j ].field + ": " + ( selectedInterval == null ? null : Arrays.asList( selectedInterval ) ) ); 
                document.close();
              }
              
              resultItem.text = snippet; 
            }
            else {

View Full Code Here


    DocumentIterator iterator = documentSequence.iterator();
    Reader reader;
    WordReader wordReader;
    ObjectList<VirtualDocumentFragment> fragments;
    Document document;


    int documentPointer = 0, documentsInBatch = 0;
    long batchStartTime = System.currentTimeMillis();
    boolean outOfMemoryError = false;


    while ( ( document = iterator.nextDocument() ) != null ) {
      
      long overallTerms = 0;
      if ( building ) builder.startDocument( document.title(), document.uri() );
      for ( int i = 0; i < numberOfIndexedFields; i++ ) {
        switch ( factory.fieldType( indexedField[ i ] ) ) {
        case TEXT:
          reader = (Reader)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          wordReader.setReader( reader );
          if ( building ) builder.startTextField();
          scan[ i ].processDocument( map != null ? map[ documentPointer ] : documentPointer, wordReader );
          if ( building ) builder.endTextField();
          overallTerms += scan[ i ].numTerms;
          break;
        case VIRTUAL:
          fragments = (ObjectList<VirtualDocumentFragment>)document.content( indexedField[ i ] );
          wordReader = document.wordReader( indexedField[ i ] );
          virtualDocumentResolver[ i ].context( document );
          for( VirtualDocumentFragment fragment: fragments ) {
            int virtualDocumentPointer = virtualDocumentResolver[ i ].resolve( fragment.documentSpecifier() );
            if ( virtualDocumentPointer < 0 ) continue;
            if ( map != null ) virtualDocumentPointer = map[ virtualDocumentPointer ];
            wordReader.setReader( new FastBufferedReader( fragment.text() ) );
            scan[ i ].processDocument( virtualDocumentPointer, wordReader );
          }
          if ( building ) builder.virtualField( fragments );
          overallTerms += scan[ i ].numTerms;
          break;
        default:
          Object o = document.content( indexedField[ i ] );
          accumulator[ i ].processData( map != null ? map[ documentPointer ] : documentPointer, o );
          if ( building ) builder.nonTextField( o );
          break;
        }


        if ( scan[ i ] != null && scan[ i ].outOfMemoryError ) outOfMemoryError = true;
      }
      if ( building ) builder.endDocument();
      documentPointer++;
      documentsInBatch++;
      document.close();
      pl.update();


      // We try compaction if we detect less than PERC_AVAILABLE_MEMORY_CHECK memory available
      long percAvailableMemory = Util.percAvailableMemory();
      boolean compacted = false;

View Full Code Here

  public Template handleRequest( final HttpServletRequest request, final HttpServletResponse response, final Context context ) throws Exception {
    if ( request.getParameter( "doc" ) != null ) {
      DocumentCollection collection = (DocumentCollection)getServletContext().getAttribute( "collection" );
      response.setContentType( request.getParameter( "m" ) );
      response.setCharacterEncoding( "UTF-8" );
      final Document document = collection.document( Integer.parseInt( request.getParameter( "doc" ) ) );
      final DocumentFactory factory = collection.factory();
      final ObjectArrayList<String> fields = new ObjectArrayList<String>();
      final int numberOfFields = factory.numberOfFields();
      
      LOGGER.debug( "ParsingFactory declares " + numberOfFields + " fields"  );
      
      for( int field = 0; field < numberOfFields; field++ ) {
        if ( factory.fieldType( field ) != FieldType.TEXT ) fields.add( StringEscapeUtils.escapeHtml( document.content( field ).toString() ) );
        else fields.add( StringEscapeUtils.escapeHtml( IOUtils.toString( (Reader)document.content( field ) ) ).replaceAll( "\n", "<br>\n" ) );
      }
      context.put( "title", document.title() );
      context.put( "fields", fields );
      context.put( "factory", factory );
      return getTemplate( "it/unimi/dsi/mg4j/query/generic.velocity" );
    }

View Full Code Here

    
    Util.ensureLog4JIsConfigured();


    final DocumentIterator documentIterator = documentSequence.iterator();


    Document document;
    FastBufferedOutputStream uriStream = null, titleStream = null;
    
    if ( jsapResult.userSpecified( "uris" ) ) uriStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "uris" ) ) );
    if ( jsapResult.userSpecified( "titles" ) ) titleStream = new FastBufferedOutputStream( new FileOutputStream( jsapResult.getString( "titles" ) ) );
    
    MutableString s = new MutableString();


    ProgressLogger progressLogger = new ProgressLogger( LOGGER, jsapResult.getLong( "logInterval" ), "documents" );
    if ( documentSequence instanceof DocumentCollection ) progressLogger.expectedUpdates = ((DocumentCollection)documentSequence).size();
    progressLogger.start( "Scanning..." );
    
    while( ( document = documentIterator.nextDocument() ) != null ) {
      if ( uriStream != null ) {
        s.replace( document.uri() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( uriStream );
        uriStream.write( '\n' );
      }
      if ( titleStream != null ) {
        s.replace( document.title() );
        s.replace( LINE_TERMINATORS, SPACES );
        s.writeUTF8( titleStream );
        titleStream.write( '\n' );
      }
      progressLogger.lightUpdate();

View Full Code Here

      pl.itemsName = "documents";
      pl.start( "Verifying random access..." );


      if ( allBitStreamIndices ) {
        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
        
        final MutableString word = new MutableString(), nonWord = new MutableString();
        
        int docCounter = 0;
        
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;


          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              // TODO: write tests for the other case
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );  
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents(  0  );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                } 
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              // text index
              pos = 0;
              termsInDoc[ i ].clear();
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                if ( ( t = (int)( (BitStreamIndex)index[ i ] ).termMap.getLong( word ) ) == -1 ) LOGGER.error( index[ i ] + ": Could not find term " + word + " in term index" );
                else {
                  if ( index[ i ].hasCounts ) termsInDoc[ i ].put( t, termsInDoc[ i ].get( t ) + 1 );
                  if ( index[ i ].hasPositions ) wordInPos[ i ][ pos++ ] = t;
                }
              }


              if ( allBitStreamIndices ) {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();


                  IndexIterator indexIterator = indexReader[ i ].documents( t );


                  int pointer = indexIterator.skipTo( currDoc );
                  if ( pointer == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c ) 
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );


                          for( int j = 0; j < c; j++ ) 
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )  
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    } 
                  }
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t + "(skipTo returned " + pointer + ")" );
                }
              }
              else {
                for( IntIterator x = termsInDoc[ i ].keySet().iterator(); x.hasNext(); ) {
                  t = x.nextInt();
                  IndexIterator indexIterator = termLists ? indexReader[ i ].documents( terms[ i ].get( t ) ) : indexReader[ i ].documents( t );


                  if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                    if ( index[ i ].hasCounts ) {
                      int c = indexIterator.count();
                      if ( termsInDoc[ i ].get( t ) !=  c ) 
                        LOGGER.error( index[ i ] + ": The count for term " + t + " in document " + currDoc + " is " + c + " instead of " + termsInDoc[ i ].get( t ) );
                      else {
                        if ( index[ i ].hasPositions ) {
                          indexIterator.positions( occ[ i ] );


                          for( int j = 0; j < c; j++ ) 
                            if ( wordInPos[ i ][ occ[ i ][ j ] ] != t )  
                              LOGGER.error( index[ i ] + ": The occurrence of index " + i + " of term " + t + " (position " + occ[ i ] +") in document " + currDoc + " is occupied instead by term " + wordInPos[ i ][ occ[ i ][ j ] ] );
                        }
                      }
                    }
                  } 
                  else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
                }
              }
            }
          }
          docCounter++;
          document.close();
          pl.update();
        }
      }
      else {
        LOGGER.warn( "Random access tests require very slow single-term scanning as not all indices are disk based" );


        it.unimi.dsi.mg4j.document.DocumentIterator documentIterator = documentSequence.iterator();
        Document document;
        Reader reader;
        WordReader wordReader;
        
        final MutableString word = new MutableString(), nonWord = new MutableString();
        
        int docCounter = 0;
        
        while( ( document = documentIterator.nextDocument() ) != null ) {
          currDoc = permutation != null ? permutation[ docCounter ] : docCounter;


          for( i = 0; i < index.length; i++ ) {
            Object content = document.content( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
            if ( index[ i ].hasPayloads ) {
              if ( allBitStreamIndices ) {
                IndexIterator indexIterator = indexReader[ i ].documents( 0 );
                int pointer = indexIterator.skipTo( currDoc );
                if ( pointer == currDoc ) {
                  Payload payload = indexIterator.payload();
                  if ( ! payload.get().equals( content ) ) LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + payload );  
                }
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
              else {
                IndexIterator indexIterator = indexReader[ i ].documents( "#" );
                if ( indexIterator.skipTo( currDoc ) == currDoc ) {
                  if ( ! indexIterator.payload().get().equals( content ) )
                    LOGGER.error( index[ i ] + ": Document " + currDoc + " has payload " + content + " but the index says " + indexIterator.payload().get() );
                } 
                else LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + t );
              }
            }
            else {
              pos = 0;
              reader = (Reader)content;
              wordReader = document.wordReader( stem || index[ i ].field == null ? indexedField[ i ] : factory.fieldIndex( index[ i ].field ) );
              wordReader.setReader( reader );
              while( wordReader.next( word, nonWord ) ) {
                if ( word.length() == 0 || index[ i ].termProcessor != null && ! index[ i ].termProcessor.processTerm( word ) ) continue;
                IndexIterator indexIterator = indexReader[ i ].documents( word );
                if ( currDoc != indexIterator.skipTo( currDoc ) )
                  LOGGER.error( index[ i ] + ": Document " + currDoc + " does not appear in the inverted list of term " + word );
                else if ( index[ i ].hasPositions ) {
                  indexIterator.positions( occ[ i ] );
                  if ( IntArrayList.wrap( occ[ i ], indexIterator.count() ).indexOf( pos ) == -1 )
                    LOGGER.error( index[ i ] + ": Position " + pos + " does not appear in the position list of term " + word + " in document " + currDoc );
                }
                pos++;
              }
            }
          }
          document.close();
          pl.update();
          docCounter++;
        }
      }

View Full Code Here

   */
  @SuppressWarnings("unchecked")
  public void checkAgainstContent( DocumentSequence sequence, int[] map, VirtualDocumentResolver resolver, int gap, Index... index ) throws IOException {
    DocumentIterator iterator = sequence.iterator();
    DocumentFactory factory = sequence.factory();
    Document document;
    final int n = index.length;
    final int[] field = new int[ n ];
    final int[][] currMaxPos = new int[ n ][];
    final int[] maxDoc = new int[ n ];
    IntArrays.fill( maxDoc, -1 );
    final Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>[] termMap = new Object2ObjectOpenHashMap[ n ];
    final IntArrayList[] payloadPointers = new IntArrayList[ n ];
    final ObjectArrayList<Object>[] payloadContent = new ObjectArrayList[ n ];


    for ( int i = 0; i < n; i++ ) {
      field[ i ] = factory.fieldIndex( index[ i ].field );
      switch ( factory.fieldType( field[ i ] ) ) {
      case VIRTUAL:
        currMaxPos[ i ] = new int[ resolver.numberOfDocuments() ];
      case TEXT:
        termMap[ i ] = new Object2ObjectOpenHashMap<MutableString, ObjectArrayList<int[]>>();
        break;
      case DATE:
      case INT:
        payloadPointers[ i ] = new IntArrayList();
        payloadContent[ i ] = new ObjectArrayList<Object>();
      }
    }


    int documentIndex = 0;


    while ( ( document = iterator.nextDocument() ) != null ) {
      for ( int i = 0; i < field.length; i++ ) {
        switch ( factory.fieldType( field[ i ] ) ) {
        case TEXT:
          processDocument( document.wordReader( field[ i ] ).setReader( (Reader)document.content( field[ i ] ) ), map == null ? documentIndex : map[ documentIndex ], 0, termMap[ i ],
              index[ i ].termProcessor );
          break;
        case VIRTUAL:
          ObjectArrayList<VirtualDocumentFragment> fragments = (ObjectArrayList<VirtualDocumentFragment>)document.content( field[ i ] );
          resolver.context( document );
          for ( VirtualDocumentFragment fragment : fragments ) {
            int d = resolver.resolve( fragment.documentSpecifier() );


            if ( d != -1 ) {
              if ( map != null ) d = map[ d ];
              if ( maxDoc[ i ] < d ) maxDoc[ i ] = d;
              currMaxPos[ i ][ d ] = processDocument( document.wordReader( field[ i ] ).setReader( new FastBufferedReader( fragment.text() ) ), d, currMaxPos[ i ][ d ], termMap[ i ],
                  index[ i ].termProcessor )
                  + gap;
            }
          }
          break;
        case INT:
        case DATE:
          Object x = document.content( field[ i ] );
          if ( x != null ) {
            payloadPointers[ i ].add( map == null ? documentIndex : map[ documentIndex ] );
            payloadContent[ i ].add( x );
          }
        default:
        }
      }
      document.close();
      documentIndex++;
    }


    iterator.close();

View Full Code Here

  public int size() {
    return virtual.length;
  }


  public Document document( final int index ) {
    return new Document() {
      public void close() {}
      public Object content( int field ) throws IOException {
        ensureDocumentIndex( index );
        ObjectArrayList<Anchor> result = new ObjectArrayList<Anchor>();
        for( Map.Entry<Integer, ? extends CharSequence> entry: virtual[ index ].entrySet() )

View Full Code Here

        4, // Very small, to induce fragmentation
        false);


    try {
      DocumentIterator iter = collection.iterator();
      Document d;
      while ((d = iter.nextDocument()) != null)
        d.title();
    } catch (IllegalStateException e) {
      assertTrue(false);
    }


  }

View Full Code Here

                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation
        false);


    DocumentIterator iter = collection.iterator();
    Document d = null;


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());


    final int textIndex = collection.factory().fieldIndex( "text" );
    
    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());


    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());


    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());


    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());


    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());


    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());


    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );


    d = iter.nextDocument();
    assertNull(d);
    iter.close();
    
    d = collection.document( 0 );
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());


    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();
    
    d = collection.document( 1 );
    assertNotNull(d);
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());


    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();


    d = collection.document( 2 );
    assertNotNull(d);
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());


    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();


    d = collection.document( 3 );
    assertNotNull(d);
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());


    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();
    
    d = collection.document( 4 );
    assertNotNull(d);
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());


    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();


    d = collection.document( 5 );
    assertNotNull(d);
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());


    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();


    d = collection.document( 6 );
    assertNotNull(d);
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());


    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();


  }

View Full Code Here

0 1

TOP

Related Classes of it.unimi.dsi.mg4j.document.Document

it.unimi.dsi.mg4j.document.DateArrayDocumentCollection

it.unimi.dsi.mg4j.document.IntArrayDocumentCollection

it.unimi.dsi.mg4j.document.MapVirtualDocumentCollection

it.unimi.dsi.mg4j.document.TRECDocumentCollectionTest

it.unimi.dsi.mg4j.query.GenericItem

it.unimi.dsi.mg4j.query.Query

it.unimi.dsi.mg4j.query.QueryServlet

it.unimi.dsi.mg4j.test.Verifier

it.unimi.dsi.mg4j.tool.IndexTest

it.unimi.dsi.mg4j.tool.Scan

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.