package it.unimi.dsi.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2005-2010 Paolo Boldi
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.Swapper;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.util.Properties;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.util.StringTokenizer;
import junit.framework.TestCase;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.FileUtils;
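/** Unit tests for the {@link DocumentCollection} implementations shipped with MG4J.
 *
 * <p>A small set of two-field (title/text) documents is materialised in several ways: an HTML
 * file set, zipped collections, simple compressed collections (exact and approximated),
 * concatenated collections and sequences, and an input stream sequence. Each test then checks
 * that the words produced by the documents' {@link WordReader}s match those of a
 * whitespace-splitting {@link StringTokenizer} over the expected text, both by random access
 * and by sequential iteration.
 */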
public class DocumentCollectionTest extends TestCase {
/* We consider documents abstractly described by two fields each.
*
* WARNING: the first string MUST be a prefix of the second string. */
private final static String[][] document = new String[][] {
// 0 1 2 3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
new String[] { "aaa uuu aaa" , "aaa uuu aaa xxx xxx xxx aaa xxx" },
// This tests that zipped collections handle properly initial spaces and
// that word readers are propagated correctly.
new String[] { " aaa uuu aaa" , " aaa uuu aaa _ __ xxx _ xxx xxx aaa xxx" },
};
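/** The documents in {@link #document}, repeated twice: the expected content of the concatenated
 * collections and sequences checked by {@link #testConcatenated()}. */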
private final static String[][] document2 = new String[][] {
// 0 1 2 3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
new String[] { "aaa uuu aaa" , "aaa uuu aaa xxx xxx xxx aaa xxx" },
// This tests that zipped collections handle properly initial spaces and
// that word readers are propagated correctly.
new String[] { " aaa uuu aaa" , " aaa uuu aaa _ __ xxx _ xxx xxx aaa xxx" },
new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
new String[] { "aaa uuu aaa" , "aaa uuu aaa xxx xxx xxx aaa xxx" },
// This tests that zipped collections handle properly initial spaces and
// that word readers are propagated correctly.
new String[] { " aaa uuu aaa" , " aaa uuu aaa _ __ xxx _ xxx xxx aaa xxx" },
};
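/** Factory properties shared by all collections under test: ASCII encoding, and a word reader
 * specified as {@code FastBufferedReader(_)}, that is, a {@link it.unimi.dsi.io.FastBufferedReader}
 * built with "_" as its argument so that underscores count as word constituents; this is what lets
 * the "_" and "__" tokens in the documents above be read as single words, matching the
 * whitespace-splitting tokenizer used by {@code checkSameWords()}. */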
private final static Properties DEFAULT_PROPERTIES = new Properties();
static {
DEFAULT_PROPERTIES.setProperty( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "ASCII" );
DEFAULT_PROPERTIES.setProperty( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, it.unimi.dsi.io.FastBufferedReader.class.getName() + "(_)" );
}
/** The number of documents. */
private final static int ndoc = document.length;
/** The temporary directory where all tests are run. */
private File tempDir;
/** The set of files in the HTML directory. */
private String[] htmlFileSet;
/** Given a two-field document, produce an HTML document with the first field as title and
* the second field as body.
*
* @param document the document.
* @return the HTML version of the document.
*/
private String getHTMLDocument( String[] document ) {
MutableString res = new MutableString();
res.append( "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Strict//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\">\n" );
res.append( "<HTML>\n<HEAD>\n<TITLE>" + document[ 0 ] + "</TITLE>\n" );
// The title is a prefix of the full text (see the warning above), so only the remainder becomes the body.
res.append( "<BODY>\n" + document[ 1 ].substring( document[ 0 ].length() ) );
res.append( "\n</BODY>\n" );
res.append( "</HTML>" );
return res.toString();
}
/** Given a two-field document, produce an mbox document with the first field as subject and
* the second field as body.
*
* @param document the document.
* @return the mbox version of the document.
*/
private String getMboxDocument( String[] document ) {
MutableString res = new MutableString();
res.append( "From MAILER-DAEMON Fri Apr 15 16:22:32 2005\n" );
res.append( "Date: 15 Apr 2005 16:22:32 +0200\n" );
res.append( "From: Mail System Internal Data <MAILER-DAEMON@sliver.usr.dsi.unimi.it>\n" );
res.append( "Subject: " + document[ 0 ] + "\n" );
res.append( "Message-ID: <1113574952@sliver.usr.dsi.unimi.it>\n" );
res.append( "X-IMAP: 1102967122 0000138458\n" );
res.append( "Return-Path: <matteo.xxx@unimi.it>\n" );
res.append( "Received: from localhost (localhost.localdomain [127.0.0.1])\n" );
res.append( "\tby sliver.usr.dsi.unimi.it (8.12.11/8.12.11) with ESMTP id iAUNtadn007305\n");
res.append( "\tfor <vigna@localhost>; Wed, 1 Dec 2004 00:55:36 +0100\n" );
res.append( "Received: from law5.usr.dsi.unimi.it [159.149.146.241]\n" );
res.append( "\tby localhost with IMAP (fetchmail-6.2.5)\n" );
res.append( "\tfor vigna@localhost (single-drop); Wed, 01 Dec 2004 00:55:36 +0100 (CET)\n" );
res.append( "To: vigna@dsi.unimi.it\n" );
res.append( "Message-id: <Pine.WNT.4.33.0412010051240.-209505@p233-mmx>\n" );
res.append( "Content-type: TEXT/PLAIN; charset=iso-8859-15\n" );
res.append( "X-Warning: UNAuthenticated Sender\n" );
res.append( "Content-Transfer-Encoding: 8bit\n" );
res.append( "Content-Length: " + document[ 1 ].length() + "\n" );
res.append( "\n" );
res.append( document[ 1 ] + "\n" );
return res.toString();
}
/** Checks that the tokenizer and the word reader return exactly the same sequence of words.
*
* @param wordReader the word reader.
* @param tok the tokenizer.
* @throws IOException
*/
private void checkSameWords( WordReader wordReader, StringTokenizer tok ) throws IOException {
MutableString word = new MutableString();
MutableString nonWord = new MutableString();
boolean moreInReader, moreInTokenizer;
boolean firstTime = true;
for (;;) {
moreInReader = wordReader.next( word, nonWord );
if ( firstTime ) {
firstTime = false;
// A document starting with whitespace yields an initial empty word: skip it, since the
// whitespace-splitting tokenizer produces no corresponding token.
if ( word.equals( "" ) ) continue;
}
// After the first word, the word reader must never return an empty word.
assertFalse( moreInReader && word.equals( "" ) );
moreInTokenizer = tok.hasMoreElements();
// The word reader and the tokenizer must agree on whether more words are available...
assertEquals( moreInReader, moreInTokenizer );
if ( !moreInReader ) break;
// ...and on the next word itself.
assertEquals( tok.nextElement(), word.toString() );
}
}
/** Checks that the documents in the collection have the same sequence of words as in
* <code>document</code>: the names of the fields to be checked are specified in the array.
*
* @param coll the collection.
* @param fieldName the field names.
* @param document documents to be checked against.
* @throws IOException
*/
private void checkAllDocuments( final DocumentCollection coll, final String[] fieldName, final String[][] document ) throws IOException {
final int nfields = fieldName.length;
final int[] fieldNumber = new int[ nfields ];
final int[] arrayIndex = new int[ nfields ];
// Look for field indices
for ( int i = 0; i < nfields; i++ ) {
arrayIndex[ i ] = i;
int j;
for ( j = 0; j < coll.factory().numberOfFields(); j++ )
if ( coll.factory().fieldName( j ).equals( fieldName[ i ] ) ) {
fieldNumber[ i ] = j;
break;
}
// Use a JUnit assertion rather than a plain assert, which may be disabled at run time.
assertTrue( "Field " + fieldName[ i ] + " not found in the factory", j < coll.factory().numberOfFields() );
}
// Sort the requested fields by factory index, keeping fieldName and arrayIndex aligned,
// since some collections return field content only in increasing field order.
Arrays.quickSort( 0, nfields, new AbstractIntComparator() {
public int compare( int x, int y ) {
return fieldNumber[ x ] - fieldNumber[ y ];
}}, new Swapper() {
public void swap( int x, int y ) {
int t = fieldNumber[ x ]; fieldNumber[ x ] = fieldNumber[ y ]; fieldNumber[ y ] = t;
t = arrayIndex[ x ]; arrayIndex[ x ] = arrayIndex[ y ]; arrayIndex[ y ] = t;
String q = fieldName[ x ]; fieldName[ x ] = fieldName[ y ]; fieldName[ y ] = q;
}} );
// Start checking
for ( int doc = 0; doc < coll.size(); doc++ ) {
Document docum = coll.document( doc );
for ( int i = 0; i < nfields; i++ ) {
int field = fieldNumber[ i ];
Reader content = (Reader)docum.content( field );
WordReader wordReader = docum.wordReader( field );
wordReader.setReader( content );
StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] );
System.err.println( "Checking document " + doc + " field " + fieldName[ i ] + " (" + field + ")" );
checkSameWords( wordReader, tok );
}
docum.close();
}
}
/** Checks that the documents in the sequence have the same sequence of words as in
* <code>document</code>: the names of the fields to be checked are specified in the array.
*
* @param seq the sequence.
* @param fieldName the field names.
* @param document documents to be checked against.
* @throws IOException
*/
private void checkAllDocumentsSeq( final DocumentSequence seq, final String[] fieldName, final String[][] document ) throws IOException {
final int nfields = fieldName.length;
final int[] fieldNumber = new int[ nfields ];
final int[] arrayIndex = new int[ nfields ];
// Look for field indices
for ( int i = 0; i < nfields; i++ ) {
arrayIndex[ i ] = i;
int j;
for ( j = 0; j < seq.factory().numberOfFields(); j++ )
if ( seq.factory().fieldName( j ).equals( fieldName[ i ] ) ) {
fieldNumber[ i ] = j;
break;
}
// Use a JUnit assertion rather than a plain assert, which may be disabled at run time.
assertTrue( "Field " + fieldName[ i ] + " not found in the factory", j < seq.factory().numberOfFields() );
}
// Sort the requested fields by factory index, keeping fieldName and arrayIndex aligned,
// since some collections return field content only in increasing field order.
Arrays.quickSort( 0, nfields, new AbstractIntComparator() {
public int compare( int x, int y ) {
return fieldNumber[ x ] - fieldNumber[ y ];
}}, new Swapper() {
public void swap( int x, int y ) {
int t = fieldNumber[ x ]; fieldNumber[ x ] = fieldNumber[ y ]; fieldNumber[ y ] = t;
t = arrayIndex[ x ]; arrayIndex[ x ] = arrayIndex[ y ]; arrayIndex[ y ] = t;
String q = fieldName[ x ]; fieldName[ x ] = fieldName[ y ]; fieldName[ y ] = q;
}} );
// Start checking
DocumentIterator iterator = seq.iterator();
Document docum;
int doc = 0;
while ( ( docum = iterator.nextDocument() ) != null ) {
for ( int i = 0; i < nfields; i++ ) {
int field = fieldNumber[ i ];
Reader content = (Reader)docum.content( field );
WordReader wordReader = docum.wordReader( field );
wordReader.setReader( content );
StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] );
System.err.println( "Checking sequentially document " + doc + " field " + fieldName[ i ] + " (" + field + ")" );
checkSameWords( wordReader, tok );
}
docum.close();
doc++;
}
iterator.close();
}
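/** Creates the test fixtures in a fresh temporary directory: one HTML file per document under
 * {@code html/}, an mbox file, exact and approximated zipped collections ({@code zip}, {@code azip})
 * and exact and approximated simple compressed collections ({@code simple}, {@code asimple}),
 * all built from the HTML file set. */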
protected void setUp() throws IOException, ClassNotFoundException, ConfigurationException {
// Create a new directory under /tmp
tempDir = File.createTempFile( "mg4jtest", null );
tempDir.delete();
tempDir.mkdir();
// Now create the hierarchy for HTML files
File htmlDir = new File( tempDir, "html" );
htmlDir.mkdir();
System.err.println( "Temporary directory: " + tempDir );
htmlFileSet = new String[ ndoc ];
for ( int i = 0; i < ndoc; i++ ) {
String docFile = new File( htmlDir, "doc" + i + ".html" ).toString();
htmlFileSet[ i ] = docFile;
Writer docWriter = new OutputStreamWriter( new FileOutputStream( docFile ), "ISO-8859-1" );
docWriter.write( getHTMLDocument( document[ i ] ) );
docWriter.close();
}
// Now create the mbox file
Writer mboxWriter = new OutputStreamWriter( new FileOutputStream( new File( tempDir, "mbox" ) ), "ISO-8859-1" );
for ( int i = 0; i < ndoc; i++ )
mboxWriter.write( getMboxDocument( document[ i ] ) );
mboxWriter.close();
// Now create the zip collections
FileSetDocumentCollection fileSetDocumentCollection = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) );
ZipDocumentCollectionBuilder zipCollBuilder = new ZipDocumentCollectionBuilder( new File( tempDir, "zip" ).toString(),
fileSetDocumentCollection.factory(), true );
zipCollBuilder.build( fileSetDocumentCollection );
ZipDocumentCollectionBuilder apprZipCollBuilder = new ZipDocumentCollectionBuilder( new File( tempDir, "azip" ).toString(),
fileSetDocumentCollection.factory(), false );
apprZipCollBuilder.build( fileSetDocumentCollection );
// Now create the simple collections
SimpleCompressedDocumentCollectionBuilder simpleCollBuilder = new SimpleCompressedDocumentCollectionBuilder( new File( tempDir, "simple" ).toString(),
fileSetDocumentCollection.factory(), true );
simpleCollBuilder.build( fileSetDocumentCollection );
SimpleCompressedDocumentCollectionBuilder apprSimpleCollBuilder = new SimpleCompressedDocumentCollectionBuilder( new File( tempDir, "asimple" ).toString(),
fileSetDocumentCollection.factory(), false );
apprSimpleCollBuilder.build( fileSetDocumentCollection );
fileSetDocumentCollection.close();
}
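/** Deletes the temporary directory and all fixtures created by {@link #setUp()}. */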
protected void tearDown() throws IOException {
FileUtils.forceDelete( tempDir );
}
public void testFileSetDocumentCollection() throws IOException, ConfigurationException {
System.err.println( "Checking fileset collection" );
FileSetDocumentCollection coll = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) );
assertEquals( ndoc, coll.size() );
checkAllDocuments( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testFileSetDocumentCollectionSeq() throws IOException, ConfigurationException {
System.err.println( "Checking fileset collection sequentially" );
FileSetDocumentCollection coll = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) );
checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testZipDocumentCollection() throws IOException, ClassNotFoundException {
System.err.println( "Checking zipped collection" );
ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "zip.collection" ).toString() );
checkAllDocuments( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testZipDocumentCollectionSeq() throws IOException, ClassNotFoundException {
System.err.println( "Checking zipped collection sequentially" );
ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "zip.collection" ).toString() );
checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testZipDocumentCollectionAppr() throws IOException, ClassNotFoundException {
System.err.println( "Checking approximated zipped collection" );
ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "azip.collection" ).toString() );
checkAllDocuments( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testZipDocumentCollectionApprSeq() throws IOException, ClassNotFoundException {
System.err.println( "Checking approximated zipped collection sequentially" );
ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "azip.collection" ).toString() );
checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testSimpleCompressedDocumentCollection() throws IOException, ClassNotFoundException {
System.err.println( "Checking simple compressed collection" );
SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "simple.collection" ).toString() );
checkAllDocuments( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testSimpleCompressedDocumentCollectionSeq() throws IOException, ClassNotFoundException {
System.err.println( "Checking simple compressed collection sequentially" );
SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "simple.collection" ).toString() );
checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testSimpleCompressedDocumentCollectionAppr() throws IOException, ClassNotFoundException {
System.err.println( "Checking approximated simple compressed collection" );
SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
checkAllDocuments( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testSimpleCompressedDocumentCollectionApprSeq() throws IOException, ClassNotFoundException {
System.err.println( "Checking approximated simple compressed collection sequentially" );
SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
coll.close();
}
public void testConcatenated() throws IOException, ClassNotFoundException {
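System.err.println( "Checking concatenated collections" );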
SimpleCompressedDocumentCollection coll0 = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
SimpleCompressedDocumentCollection coll1 = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
ConcatenatedDocumentCollection concatenatedDocumentCollection = new ConcatenatedDocumentCollection( new String[] { new File( tempDir, "asimple.collection" ).toString(), new File( tempDir, "asimple.collection" ).toString() } );
ConcatenatedDocumentSequence concatenatedDocumentSequence0 = new ConcatenatedDocumentSequence( coll0, coll1 );
ConcatenatedDocumentSequence concatenatedDocumentSequence1 = new ConcatenatedDocumentSequence( new File( tempDir, "asimple.collection" ).toString(), new File( tempDir, "asimple.collection" ).toString() );
checkAllDocumentsSeq( concatenatedDocumentSequence0, new String[] { "title", "text" }, document2 );
checkAllDocumentsSeq( concatenatedDocumentSequence1, new String[] { "title", "text" }, document2 );
checkAllDocuments( concatenatedDocumentCollection, new String[] { "title", "text" }, document2 );
concatenatedDocumentCollection.close();
concatenatedDocumentSequence0.close();
concatenatedDocumentSequence1.close();
}
public void testInputStreamSequence() throws IOException, ConfigurationException {
System.err.println( "Checking input stream (text field only)" );
// Extract only field number 1, and write it out with separator '\u0000'
MutableString res = new MutableString();
String[][] justSecondField = new String[ ndoc ][ 1 ];
for ( int i = 0; i < ndoc; i++ ) {
res.append( document[ i ][ 1 ] + "\u0000" );
justSecondField[ i ][ 0 ] = document[ i ][ 1 ];
}
String resString = res.toString();
// Write the sequence on a file (in UTF-8)
Writer resWriter = new OutputStreamWriter( new FileOutputStream( new File( tempDir, "stream" ) ), "UTF-8" );
resWriter.write( resString );
resWriter.close();
// Read it as a input stream document sequence
InputStream is = new FileInputStream( new File( tempDir, "stream" ) );
DocumentSequence seq = new InputStreamDocumentSequence( is, '\u0000', new IdentityDocumentFactory( DEFAULT_PROPERTIES ) );
checkAllDocumentsSeq( seq, new String[] { "text" }, justSecondField );
seq.close();
}
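// The mbox tests below are currently disabled: they read the mbox file created in setUp()
// through JavamailDocumentCollection, using the mstor JavaMail provider.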
/*public void testMboxDocumentCollection() throws IOException, ConfigurationException, MessagingException {
System.err.println( "Checking mbox collection" );
JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES );
checkAllDocuments( coll, new String[] { "subject", "body" }, document );
coll.close();
}
public void testMboxDocumentCollectionSeq() throws IOException, ConfigurationException, MessagingException {
System.err.println( "Checking mbox collection sequentially" );
JavamailDocumentCollection coll = new JavamailDocumentCollection( "mstor:" + tempDir, "mbox", DEFAULT_PROPERTIES );
checkAllDocumentsSeq( coll, new String[] { "subject", "body" }, document );
coll.close();
}*/
}