Package org.archive.modules.extractor

Source Code of org.archive.modules.extractor.ContentExtractorTestBase

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/
package org.archive.modules.extractor;


import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessorTestBase;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.state.ModuleTestBase;
import org.archive.util.Recorder;


/**
* Abstract base class for unit testing ContentExtractor implementations.
*
* @author pjack
*/
public abstract class ContentExtractorTestBase extends ProcessorTestBase {

   
    /**
     * An extractor created during the setUp.
     */
    protected Extractor extractor;

   
    /**
     * Sets up the {@link #extractor} and
     * {@link ModuleTestBase#processorClass}
     * fields.
     */
    final public void setUp() {
        extractor = makeExtractor();
    }
   
   
    @Override
    protected Object makeModule() {
        return makeExtractor();
    }
       
   
    /**
     * Subclasses should return an Extractor instance to test.
     *
     * @return   an Extractor instance to test
     */
    protected abstract Extractor makeExtractor();
   
   
    /**
     * Returns a CrawlURI for testing purposes.
     *
     * @return   a CrawlURI
     * @throws Exception   just in case
     */
    protected CrawlURI defaultURI() throws Exception {
        UURI uuri = UURIFactory.getInstance("http://www.archive.org/start/");
        return new CrawlURI(uuri, null, null, LinkContext.NAVLINK_MISC);
    }
   
   
    /**
     * Tests that a URI with a zero content length has no links extracted.
     *
     * @throws Exception   just in case
     */
    public void testZeroContent() throws Exception {
        CrawlURI uri = defaultURI();
        Recorder recorder = createRecorder("");
        uri.setContentType("text/plain");
        uri.setRecorder(recorder);
        extractor.process(uri);
        assertEquals(0, uri.getOutLinks().size());
        assertNoSideEffects(uri);
    }
   
   
    /**
     * Tests that a URI whose linkExtractionFinished flag has been set has
     * no links extracted.
     *
     * @throws Exception   just in case
     */
    public void testFinished() throws Exception {
        CrawlURI uri = defaultURI();
        uri.linkExtractorFinished();
        extractor.process(uri);
        assertEquals(0, uri.getOutLinks().size());
        assertNoSideEffects(uri);       
    }

   
    /**
     * Asserts that the given URI has no URI errors, no localized errors, and
     * no annotations.
     *
     * @param uri   the URI to test
     */
    protected static void assertNoSideEffects(CrawlURI uri) {
        assertEquals(0, uri.getNonFatalFailures().size());
        assertTrue(uri.getAnnotations().isEmpty());
    }
   
    @Deprecated
    public static Recorder createRecorder(String content) throws IOException {
        return createRecorder(content, Charset.defaultCharset().name());
    }
   
    public static Recorder createRecorder(String content, String charset)
            throws IOException {
        File temp = File.createTempFile("test", ".tmp");
        Recorder recorder = new Recorder(temp, 1024, 1024);
        byte[] b = content.getBytes(charset);
        ByteArrayInputStream bais = new ByteArrayInputStream(b);
        InputStream is = recorder.inputWrap(bais);
        recorder.markContentBegin();
        for (int x = is.read(); x >= 0; x = is.read());
        is.close();
        return recorder;
    }

}
TOP

Related Classes of org.archive.modules.extractor.ContentExtractorTestBase

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.