Package org.archive.modules.recrawl.wbm

Source Code of org.archive.modules.recrawl.wbm.WbmPersistLoadProcessorTest$LoadTask

package org.archive.modules.recrawl.wbm;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URLEncoder;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import junit.framework.TestCase;

import org.apache.http.HttpResponse;
import org.apache.http.ProtocolVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.message.BasicHttpResponse;
import org.apache.http.message.BasicStatusLine;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.recrawl.FetchHistoryHelper;
import org.archive.modules.recrawl.FetchHistoryProcessor;
import org.archive.modules.recrawl.RecrawlAttributeConstants;
import org.archive.net.UURIFactory;
import org.archive.util.Base32;
import org.archive.util.DateUtils;
import org.easymock.EasyMock;

import com.google.common.util.concurrent.ExecutionList;

/**
* unit test for {@link WbmPersistLoadProcessor}.
*
* TODO:
* <ul>
* <li>test for pathological cases: illegal chars, incorrect length, etc.
* <li>test if connection is properly released back to the pool for all possible cases.
* </ul>
* @author kenji
*
*/
public class WbmPersistLoadProcessorTest extends TestCase {

  public void testBuildURL() throws Exception {
    WbmPersistLoadProcessor t = new WbmPersistLoadProcessor();
    t.setQueryURL("http://web.archive.org/cdx/search/cdx?url=$u&startDate=$s&limit=1");
    final String URL = "http://archive.org/";
    String url = t.buildURL(URL);
    System.err.println(url);
    assertTrue("has encode URL", Pattern.matches(".*[&?]url="+URLEncoder.encode(URL, "UTF-8")+"([&].*)?", url));
    assertTrue("has startDate", Pattern.matches(".*[&?]startDate=\\d{14}([&].*)?", url));
    assertTrue("has limit", Pattern.matches(".*[&?]limit=\\d+([&].*)?", url));
    //assertTrue("has last=true", Pattern.matches(".*[&?]last=true([&].*)?", url));
  }
 
  /**
   * stub HttpResponse for normal case.
   * @author kenji
   *
   */
  public static class TestNormalHttpResponse extends BasicHttpResponse {
    public static final String EXPECTED_TS = "20121101155310";
    public static final String EXPECTED_HASH = "GHN5VKF3TBKNSEZTASOM23BJRTKFFNJK";
    public TestNormalHttpResponse() {
      super(new BasicStatusLine(new ProtocolVersion("HTTP", 1, 0), 200, "OK"));
      setEntity(new ByteArrayEntity(
    ("org,archive)/ "+EXPECTED_TS+" http://archive.org/ text/html 200 "+
        EXPECTED_HASH+" - - 6908 982548871 "+
        "google.es-20121101-155506/IA-FOC-google.es-20121101073708-00001.warc.gz\n"
    ).getBytes()
      ));
    }
  }
 
  protected Map<String, Object> getFetchHistory(CrawlURI curi, int idx) {
    Map<String, Object> data = curi.getData();
    @SuppressWarnings("unchecked")
    Map<String, Object>[] historyArray = (Map[])data.get(RecrawlAttributeConstants.A_FETCH_HISTORY);
    assertNotNull(historyArray);
    Map<String, Object> history = historyArray[idx];
    return history;
  }
 
  private byte[] sha1Digest(String text) throws NoSuchAlgorithmException {
      MessageDigest md = MessageDigest.getInstance("sha1");
      return md.digest(text.getBytes());
  }

  /**
   * this test is disabled because it talks to production CDX server.
   * @throws Exception
   */
  public void _testInnerProcessResultSingleShotWithMock() throws Exception {
    // because AbstractHttpClient marks most execute(...) methods final, it is very tiresome to implement
    // a stub by implementing HttpClient interface. So I use EasyMock.
    HttpClient client = EasyMock.createMock(HttpClient.class);
    HttpResponse testResponse = new TestNormalHttpResponse();
    EasyMock.expect(client.execute((HttpUriRequest)EasyMock.notNull())).andReturn(testResponse);
    EasyMock.replay(client);
   
    final String CONTENT_DIGEST_SCHEME = "sha1:";
    WbmPersistLoadProcessor t = new WbmPersistLoadProcessor();
    t.setHttpClient(client);
    t.setContentDigestScheme(CONTENT_DIGEST_SCHEME);
    CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://archive.org/"));
   
    // put history entry newer than being loaded (i.e. loaded history entry will not be used for FetchHistoryProcessor
    // check below.
    long expected_ts = DateUtils.parse14DigitDate(TestNormalHttpResponse.EXPECTED_TS).getTime();
    Map<String, Object>[] fetchHistory = (Map[])curi.getData().get(RecrawlAttributeConstants.A_FETCH_HISTORY);
    if (fetchHistory == null) {
      fetchHistory = new HashMap[2];
      curi.getData().put(RecrawlAttributeConstants.A_FETCH_HISTORY, fetchHistory);
    }
    final byte[] digestValue0 = sha1Digest("0");
    final byte[] digestValue1 = sha1Digest("1");
    fetchHistory[0] = new HashMap<String, Object>();
    fetchHistory[0].put(FetchHistoryHelper.A_TIMESTAMP, expected_ts + 2000);
    fetchHistory[0].put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, expected_ts + 2000);
    fetchHistory[0].put(RecrawlAttributeConstants.A_CONTENT_DIGEST,
        CONTENT_DIGEST_SCHEME + Base32.encode(digestValue0));
    fetchHistory[1] = new HashMap<String, Object>();
    fetchHistory[1].put(FetchHistoryHelper.A_TIMESTAMP, expected_ts - 2000);
    fetchHistory[1].put(RecrawlAttributeConstants.A_CONTENT_DIGEST,
        CONTENT_DIGEST_SCHEME + Base32.encode(digestValue1));
   
    ProcessResult result = t.innerProcessResult(curi);
    assertEquals("result is PROCEED", ProcessResult.PROCEED, result);
   
    // newly loaded history entry should fall in between two existing entries (index=1)
    Map<String, Object> history = getFetchHistory(curi, 1);
    assertNotNull("history", history);
    String hash = (String)history.get(RecrawlAttributeConstants.A_CONTENT_DIGEST);
    assertEquals("CONTENT_DIGEST", CONTENT_DIGEST_SCHEME+TestNormalHttpResponse.EXPECTED_HASH, hash);
   
    Long ts = (Long)history.get(FetchHistoryHelper.A_TIMESTAMP);
    assertNotNull("ts is non-null", ts);
    assertEquals("'ts' has expected timestamp", expected_ts, ts.longValue());

    // Check compatibility with FetchHistoryProcessor.
    // TODO: This is not testing WbmPersistLoadProcessor - only testing stub fetchHistory
    // setup above (OK as long as it matches WbmPersistLoadProcessor). We need a separate
    // test method.
    curi.setFetchStatus(200);
    curi.setFetchBeginTime(System.currentTimeMillis());
    // FetchHistoryProcessor once failed for a revisit case. We'd need to test other cases
    // too (TODO).
    curi.setContentDigest("sha1", digestValue0);
    FetchHistoryProcessor fhp = new FetchHistoryProcessor();
    fhp.process(curi);
  }
 
  public void testInnerProcessResultSingleShotWithRealServer() throws Exception {
    WbmPersistLoadProcessor t = new WbmPersistLoadProcessor();
    //CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://archive.org/"));
    CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.mext.go.jp/null.gif"));
    ProcessResult result = t.innerProcessResult(curi);
    Map<String, Object> history = getFetchHistory(curi, 0);
    assertNotNull("getFetchHistory returns non-null", history);
    String hash = (String)history.get(RecrawlAttributeConstants.A_CONTENT_DIGEST);
    assertNotNull("CONTENT_DIGEST is non-null", hash);
    assertTrue("CONTENT_DIGEST starts with scheme", hash.startsWith(t.getContentDigestScheme()));
    assertEquals("CONTENT_DIGEST is a String of length 32", 32, hash.substring(t.getContentDigestScheme().length()).length());
   
    assertEquals("should always return PROCEED", ProcessResult.PROCEED, result);
  }
 
  public static class LoadTask implements Runnable {
    private WbmPersistLoadProcessor p;
    private String uri;
    public LoadTask(WbmPersistLoadProcessor p, String uri) {
      this.p = p;
      this.uri = uri;
    }
    @Override
    public void run() {
      try {
      CrawlURI curi = new CrawlURI(UURIFactory.getInstance(this.uri));
      p.innerProcessResult(curi);
      //System.err.println(curi.toString());
      } catch (Exception ex) {
      ex.printStackTrace();
      }
    }
  }
 
  /**
   * test for performance.
   * not named as test case. run this through main().
   * @throws Exception
   */
  public void measureInnerProcessResultMany() throws Exception {
    final WbmPersistLoadProcessor t = new WbmPersistLoadProcessor();
    InputStream is = getClass().getResourceAsStream("/test-url-list.txt");
    BufferedReader br = new BufferedReader(new InputStreamReader(is));
    String line;
    int nurls = 0;
    ExecutorService executor = Executors.newFixedThreadPool(100);
    ExecutionList tasks = new ExecutionList();
    while ((line = br.readLine()) != null) {
      tasks.add(new LoadTask(t, line), executor);
      nurls++;
    }
    long t0 = System.currentTimeMillis();
    tasks.execute();
    executor.awaitTermination(30, TimeUnit.SECONDS);
    long el = System.currentTimeMillis() - t0;
    System.err.println(nurls + " urls, time=" + el + "ms (" + (nurls / (el / 1000.0)) + " URI/s");
  }
 
  public static void main() throws Exception {
    (new WbmPersistLoadProcessorTest()).measureInnerProcessResultMany();
  }
}
TOP

Related Classes of org.archive.modules.recrawl.wbm.WbmPersistLoadProcessorTest$LoadTask

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.