Package edu.umd.cloud9.integration.collection.trecweb

Source Code of edu.umd.cloud9.integration.collection.trecweb.IntegrationTest

package edu.umd.cloud9.integration.collection.trecweb;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.util.Random;

import junit.framework.JUnit4TestAdapter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import com.google.common.base.Joiner;

import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.collection.trecweb.Gov2DocnoMapping;
import edu.umd.cloud9.collection.trecweb.TrecWebDocumentInputFormat;
import edu.umd.cloud9.collection.trecweb.Wt10gDocnoMapping;
import edu.umd.cloud9.integration.IntegrationUtils;

public class IntegrationTest {
  private static final Random random = new Random();

  private static final Path wt10gPath = new Path("/shared/collections/wt10g/collection.raw");
  private static final Path gov2Path = new Path("/shared/collections/gov2/collection.raw/gov2-corpus");

  private static final String tmpPrefix = "tmp-" + IntegrationTest.class.getCanonicalName() + "-" + random.nextInt(10000);

  // wt10g, repacked.
  @Test
  public void testWt10gDocnoMappingRepacked() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    assertTrue(fs.exists(wt10gPath));

    String repackedCollection = tmpPrefix + "-wt10g-repacked";
    String wt10gMappingFile = tmpPrefix + "-wt10g-mapping-repacked.dat";

    String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "cloud9"),
        edu.umd.cloud9.collection.trecweb.RepackTrecWebCollection.class.getCanonicalName(),
        "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
        "-collection=" + wt10gPath,
        "-output=" + repackedCollection,
        "-compressionType=block" };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "cloud9"),
        edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.class.getCanonicalName(),
        "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
        "-" + DocnoMapping.BuilderUtils.COLLECTION_OPTION + "=" + repackedCollection,
        "-" + DocnoMapping.BuilderUtils.MAPPING_OPTION + "=" + wt10gMappingFile };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    Wt10gDocnoMapping mapping = new Wt10gDocnoMapping();
    mapping.loadMapping(new Path(wt10gMappingFile), fs);

    assertEquals("WTX001-B01-1", mapping.getDocid(1));
    assertEquals("WTX062-B34-37", mapping.getDocid(1000000));

    assertEquals(1, mapping.getDocno("WTX001-B01-1"));
    assertEquals(1000000, mapping.getDocno("WTX062-B34-37"));
  }

  // gov2, repacked.
  @Test
  public void testGov2DocnoMappingRepacked() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    assertTrue(fs.exists(gov2Path));

    String repackedCollection = tmpPrefix + "-gov2-repacked";
    String gov2MappingFile = tmpPrefix + "-gov2-mapping-repacked.dat";

    String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "cloud9"),
        edu.umd.cloud9.collection.trecweb.RepackTrecWebCollection.class.getCanonicalName(),
        "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
        "-collection=" + gov2Path, "-output=" + repackedCollection, "-compressionType=block" };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "cloud9"),
        edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.class.getCanonicalName(),
        "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
        "-" + DocnoMapping.BuilderUtils.COLLECTION_OPTION + "=" + repackedCollection,
        "-" + DocnoMapping.BuilderUtils.MAPPING_OPTION + "=" + gov2MappingFile };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    Gov2DocnoMapping mapping = new Gov2DocnoMapping();
    mapping.loadMapping(new Path(gov2MappingFile), fs);

    assertEquals("GX000-00-0000000", mapping.getDocid(1));
    assertEquals("GX210-38-0737901", mapping.getDocid(20000000));

    assertEquals(1, mapping.getDocno("GX000-00-0000000"));
    assertEquals(20000000, mapping.getDocno("GX210-38-0737901"));
  }

  // wt10g, original.
  @Test
  public void testWt10gDocnoMapping() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    assertTrue(fs.exists(wt10gPath));

    String wt10gMappingFile = tmpPrefix + "-wt10g-mapping.dat";

    String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "cloud9"),
        edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.class.getCanonicalName(),
        "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
        "-" + DocnoMapping.BuilderUtils.COLLECTION_OPTION + "=" + wt10gPath,
        "-" + DocnoMapping.BuilderUtils.FORMAT_OPTION + "=" + TrecWebDocumentInputFormat.class.getCanonicalName(),
        "-" + DocnoMapping.BuilderUtils.MAPPING_OPTION + "=" + wt10gMappingFile };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    Wt10gDocnoMapping mapping = new Wt10gDocnoMapping();
    mapping.loadMapping(new Path(wt10gMappingFile), fs);

    assertEquals("WTX001-B01-1", mapping.getDocid(1));
    assertEquals("WTX062-B34-37", mapping.getDocid(1000000));

    assertEquals(1, mapping.getDocno("WTX001-B01-1"));
    assertEquals(1000000, mapping.getDocno("WTX062-B34-37"));
  }

  // gov2, original.
  @Test
  public void testGov2DocnoMapping() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    assertTrue(fs.exists(gov2Path));

    String gov2MappingFile = tmpPrefix + "-gov2-mapping.dat";

    String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "cloud9"),
        edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.class.getCanonicalName(),
        "-libjars=" + IntegrationUtils.getJar("lib", "guava"),
        "-" + DocnoMapping.BuilderUtils.COLLECTION_OPTION + "=" + gov2Path,
        "-" + DocnoMapping.BuilderUtils.FORMAT_OPTION + "=" + TrecWebDocumentInputFormat.class.getCanonicalName(),
        "-" + DocnoMapping.BuilderUtils.MAPPING_OPTION + "=" + gov2MappingFile };

    IntegrationUtils.exec(Joiner.on(" ").join(args));

    Gov2DocnoMapping mapping = new Gov2DocnoMapping();
    mapping.loadMapping(new Path(gov2MappingFile), fs);

    assertEquals("GX000-00-0000000", mapping.getDocid(1));
    assertEquals("GX210-38-0737901", mapping.getDocid(20000000));

    assertEquals(1, mapping.getDocno("GX000-00-0000000"));
    assertEquals(20000000, mapping.getDocno("GX210-38-0737901"));
  }

  public static junit.framework.Test suite() {
    return new JUnit4TestAdapter(IntegrationTest.class);
  }
}
TOP

Related Classes of edu.umd.cloud9.integration.collection.trecweb.IntegrationTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and is owned by ORACLE Inc. Contact coftware#gmail.com.