Package io.lumify.tikaTextExtractor

Source Code of io.lumify.tikaTextExtractor.TikaTextExtractorGraphPropertyWorkerTest

package io.lumify.tikaTextExtractor;

import io.lumify.core.config.HashMapConfigurationLoader;
import io.lumify.core.ingest.graphProperty.GraphPropertyWorkData;
import io.lumify.core.ingest.graphProperty.GraphPropertyWorkerPrepareData;
import io.lumify.core.model.audit.AuditRepository;
import io.lumify.core.model.properties.LumifyProperties;
import io.lumify.core.model.workQueue.WorkQueueRepository;
import org.apache.commons.io.IOUtils;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.Mock;
import org.mockito.runners.MockitoJUnitRunner;
import org.securegraph.*;
import org.securegraph.inmemory.InMemoryAuthorizations;
import org.securegraph.inmemory.InMemoryGraph;
import org.securegraph.property.StreamingPropertyValue;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import static org.junit.Assert.assertEquals;

@RunWith(MockitoJUnitRunner.class)
public class TikaTextExtractorGraphPropertyWorkerTest {
    private Graph graph;
    private Visibility visibility;
    private Authorizations authorizations;
    private TikaTextExtractorGraphPropertyWorker textExtractor;

    @Mock
    private WorkQueueRepository workQueueRepository;

    @Mock
    private AuditRepository auditRepository;

    @Before
    public void before() throws Exception {
        graph = new InMemoryGraph();
        visibility = new Visibility("");
        authorizations = new InMemoryAuthorizations();
        textExtractor = new TikaTextExtractorGraphPropertyWorker();

        Map config = new HashMap();
        config.put(io.lumify.core.config.Configuration.ONTOLOGY_IRI_PERSON, "http://lumify.io/test#person");
        config.put(io.lumify.core.config.Configuration.ONTOLOGY_IRI_LOCATION, "http://lumify.io/test#location");
        config.put(io.lumify.core.config.Configuration.ONTOLOGY_IRI_ORGANIZATION, "http://lumify.io/test#organization");
        config.put(io.lumify.core.config.Configuration.ONTOLOGY_IRI_ARTIFACT_HAS_ENTITY, "http://lumify.io/test#artifactHasEntity");
        io.lumify.core.config.Configuration configuration = new HashMapConfigurationLoader(config).createConfiguration();

        GraphPropertyWorkerPrepareData prepareData = new GraphPropertyWorkerPrepareData(config, null, null, null, null, null);
        textExtractor.setConfiguration(configuration);
        textExtractor.setGraph(graph);
        textExtractor.setWorkQueueRepository(workQueueRepository);
        textExtractor.setAuditRepository(auditRepository);
        textExtractor.prepare(prepareData);
    }

    @Test
    public void testExtractWithHtml() throws Exception {
        String data = "<html>";
        data += "<head>";
        data += "<title>Test Title</title>";
        data += "<meta content=\"2013-01-01T18:09:20Z\" itemprop=\"datePublished\" name=\"pubdate\"/>";
        data += "</head>";
        data += "<body>";
        data += "<div><table><tr><td>Menu1</td><td>Menu2</td><td>Menu3</td></tr></table></div>\n";
        data += "\n";
        data += "<h1>Five reasons why Windows 8 has failed</h1>\n";
        data += "<p>The numbers speak for themselves. Vista, universally acknowledged as a failure, actually had significantly better adoption numbers than Windows 8. At similar points in their roll-outs, Vista had a desktop market share of 4.52% compared to Windows 8's share of 2.67%. Underlining just how poorly Windows 8's adoption has gone, Vista didn't even have the advantage of holiday season sales to boost its numbers. Tablets--and not Surface RT tablets--were what people bought last December, not Windows 8 PCs.</p>\n";
        data += "</body>";
        data += "</html>";
        createVertex(data, "text/html");

        InputStream in = new ByteArrayInputStream(data.getBytes());
        Vertex vertex = graph.getVertex("v1", authorizations);
        Property property = vertex.getProperty(LumifyProperties.RAW.getPropertyName());
        GraphPropertyWorkData workData = new GraphPropertyWorkData(vertex, property, null, null);
        textExtractor.execute(in, workData);

        vertex = graph.getVertex("v1", authorizations);
        assertEquals("Test Title", LumifyProperties.TITLE.getPropertyValue(vertex));

        assertEquals(
                "Five reasons why Windows 8 has failed\n" +
                        "The numbers speak for themselves. Vista, universally acknowledged as a failure, actually had significantly better adoption numbers than Windows 8. At similar points in their roll-outs, Vista had a desktop market share of 4.52% compared to Windows 8's share of 2.67%. Underlining just how poorly Windows 8's adoption has gone, Vista didn't even have the advantage of holiday season sales to boost its numbers. Tablets--and not Surface RT tablets--were what people bought last December, not Windows 8 PCs.\n",
                IOUtils.toString(LumifyProperties.TEXT.getPropertyValue(vertex).getInputStream(), "UTF-8")
        );
        assertEquals(new Date(1357063760000L), LumifyProperties.CREATE_DATE.getPropertyValue(vertex));
    }

    private void createVertex(String data, String mimeType) throws UnsupportedEncodingException {
        VertexBuilder v = graph.prepareVertex("v1", visibility);
        StreamingPropertyValue textValue = new StreamingPropertyValue(new ByteArrayInputStream(data.getBytes("UTF-8")), byte[].class);
        textValue.searchIndex(false);
        Map<String, Object> metadata = new HashMap<String, Object>();
        metadata.put(LumifyProperties.MIME_TYPE.getPropertyName(), mimeType);
        LumifyProperties.RAW.setProperty(v, textValue, metadata, visibility);
        v.save(authorizations);
    }

    @Test
    public void testExtractWithEmptyHtml() throws Exception {
        String data = "<html>";
        data += "<head>";
        data += "<title>Test Title</title>";
        data += "<meta content=\"2013-01-01T18:09:20Z\" itemprop=\"datePublished\" name=\"pubdate\"/>";
        data += "</head>";
        data += "<body>";
        data += "</body>";
        data += "</html>";
        createVertex(data, "text/html");

        InputStream in = new ByteArrayInputStream(data.getBytes());
        Vertex vertex = graph.getVertex("v1", authorizations);
        Property property = vertex.getProperty(LumifyProperties.RAW.getPropertyName());
        GraphPropertyWorkData workData = new GraphPropertyWorkData(vertex, property, null, null);
        textExtractor.execute(in, workData);

        vertex = graph.getVertex("v1", authorizations);
        assertEquals("Test Title", LumifyProperties.TITLE.getPropertyValue(vertex));
        assertEquals("", IOUtils.toString(LumifyProperties.TEXT.getPropertyValue(vertex).getInputStream(), "UTF-8"));
        assertEquals(new Date(1357063760000L), LumifyProperties.CREATE_DATE.getPropertyValue(vertex));
    }

    @Test
    public void testExtractWithNotHtml() throws Exception {
        String data = "<title>Test Title</title>";
        data += "<meta content=\"2013-01-01T18:09:20Z\" itemprop=\"datePublished\" name=\"pubdate\"/>";
        data += "<h1>Five reasons why Windows 8 has failed</h1>";
        data += "<p>The numbers speak for themselves. Vista, universally acknowledged as a failure, actually had significantly better adoption numbers than Windows 8. At similar points in their roll-outs, Vista had a desktop market share of 4.52% compared to Windows 8's share of 2.67%. Underlining just how poorly Windows 8's adoption has gone, Vista didn't even have the advantage of holiday season sales to boost its numbers. Tablets--and not Surface RT tablets--were what people bought last December, not Windows 8 PCs.</p>";
        data += "</body>";
        data += "</html>";
        createVertex(data, "text/html");

        InputStream in = new ByteArrayInputStream(data.getBytes());
        Vertex vertex = graph.getVertex("v1", authorizations);
        Property property = vertex.getProperty(LumifyProperties.RAW.getPropertyName());
        GraphPropertyWorkData workData = new GraphPropertyWorkData(vertex, property, null, null);
        textExtractor.execute(in, workData);

        vertex = graph.getVertex("v1", authorizations);
        assertEquals("Test Title", LumifyProperties.TITLE.getPropertyValue(vertex));
        assertEquals(
                "Five reasons why Windows 8 has failed\n" +
                        "The numbers speak for themselves. Vista, universally acknowledged as a failure, actually had significantly better adoption numbers than Windows 8. At similar points in their roll-outs, Vista had a desktop market share of 4.52% compared to Windows 8's share of 2.67%. Underlining just how poorly Windows 8's adoption has gone, Vista didn't even have the advantage of holiday season sales to boost its numbers. Tablets--and not Surface RT tablets--were what people bought last December, not Windows 8 PCs.\n",
                IOUtils.toString(LumifyProperties.TEXT.getPropertyValue(vertex).getInputStream(), "UTF-8")
        );
        assertEquals(new Date(1357063760000L), LumifyProperties.CREATE_DATE.getPropertyValue(vertex));
    }

    @Test
    public void testExtractTextWithAccentCharacters() throws Exception {
        String data = "the Quita Suena\u0301 bank";
        createVertex(data, "text/plain; charset=utf-8");

        InputStream in = new ByteArrayInputStream(data.getBytes("UTF-8"));
        Vertex vertex = graph.getVertex("v1", authorizations);
        Property property = vertex.getProperty(LumifyProperties.RAW.getPropertyName());
        GraphPropertyWorkData workData = new GraphPropertyWorkData(vertex, property, null, null);
        textExtractor.execute(in, workData);

        vertex = graph.getVertex("v1", authorizations);
        String expected = "the Quita Suená bank ";
        String actual = IOUtils.toString(LumifyProperties.TEXT.getPropertyValue(vertex).getInputStream(), "UTF-8");
        assertEquals(21, expected.length());
        assertEquals(expected, actual);
        assertEquals(expected.length(), actual.length());
    }

    //todo : add test with image metadata
}
TOP

Related Classes of io.lumify.tikaTextExtractor.TikaTextExtractorGraphPropertyWorkerTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.