Package it.unimi.dsi.mg4j.document

Examples of it.unimi.dsi.mg4j.document.TRECDocumentCollection$TRECDocumentDescriptor


    OutputStream outputStream = new FileOutputStream(temp);
    IOUtils.copy(this.getClass().getResourceAsStream("testChar255.data"),
        outputStream);
    outputStream.close();

    TRECDocumentCollection collection = new TRECDocumentCollection(
        new String[] { temp.toString() },
        CompositeDocumentFactory
            .getFactory(new DocumentFactory[] {
                new TRECHeaderDocumentFactory(),
                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation
        false);

    try {
      DocumentIterator iter = collection.iterator();
      Document d;
      while ((d = iter.nextDocument()) != null)
        d.title();
    } catch (IllegalStateException e) {
      assertTrue(false);
View Full Code Here


    IOUtils.copy( this.getClass().getResourceAsStream( "testContents.data" ), outputStream );
    outputStream.close();
    IOUtils.copy( this.getClass().getResourceAsStream( "testContentsAgain.data" ), outputStreamAgain );
    outputStreamAgain.close();

    TRECDocumentCollection collection = new TRECDocumentCollection(
        new String[] { temp.toString(), tempAgain.toString() },
        CompositeDocumentFactory
            .getFactory(new DocumentFactory[] {
                new TRECHeaderDocumentFactory(),
                new HtmlDocumentFactory( new String[] { "encoding=ISO-8859-1" } ) } ),
        4, // Very small, to induce fragmentation
        false);

    DocumentIterator iter = collection.iterator();
    Document d = null;

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());

    final int textIndex = collection.factory().fieldIndex( "text" );
   
    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
   
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());

    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());

    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());

    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
   
    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());

    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());

    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNotNull(d);
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());

    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );

    d = iter.nextDocument();
    assertNull(d);
    iter.close();
   
    d = collection.document( 0 );
    assertNotNull(d);
    assertEquals("http://gx0001/", d.uri());
    assertEquals("GX001", d.title());

    assertEquals( "Line 1\n     The line 2!\n  Mamma\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();
   
    d = collection.document( 1 );
    assertNotNull(d);
    assertEquals("http://gx0002/", d.uri());
    assertEquals("GX002", d.title());

    assertEquals( "Contents of this file reside on one line only\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 2 );
    assertNotNull(d);
    assertEquals("http://gx0003/", d.uri());
    assertEquals("GX003", d.title());

    assertEquals( "Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 3 );
    assertNotNull(d);
    assertEquals("http://gx0004/", d.uri());
    assertEquals("GX004", d.title());

    assertEquals( "New content 0\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();
   
    d = collection.document( 4 );
    assertNotNull(d);
    assertEquals("http://gx0005/", d.uri());
    assertEquals("GX005", d.title());

    assertEquals( "New content 1\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 5 );
    assertNotNull(d);
    assertEquals("http://gx0006/", d.uri());
    assertEquals("GX006", d.title());

    assertEquals( "New content 2\n", IOUtils.toString( (Reader)d.content( textIndex ) ) );
    d.close();

    d = collection.document( 6 );
    assertNotNull(d);
    assertEquals("http://gx0007/", d.uri());
    assertEquals("GX007", d.title());

    assertEquals( "", IOUtils.toString( (Reader)d.content( textIndex ) ) );
View Full Code Here

TOP

Related Classes of it.unimi.dsi.mg4j.document.TRECDocumentCollection$TRECDocumentDescriptor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.