Package org.apache.any23.extractor.html

Examples of org.apache.any23.extractor.html.HReviewAggregateExtractorFactory
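None of the excerpts below instantiate HReviewAggregateExtractorFactory itself; they show the sibling microformat extractors (hCard, Geo, Adr, hReview, XFN) and the surrounding Any23 plumbing. As an orientation, here is a minimal sketch of how the factory might be exercised, assuming it follows the same createExtractor()/run() pattern and Sesame-era API (org.openrdf) used in the excerpts; the resource path, the base URI and the HReviewAggregateExtractor return type are illustrative assumptions, not taken from the excerpts.

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;

import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResultImpl;
import org.apache.any23.extractor.html.HReviewAggregateExtractor;
import org.apache.any23.extractor.html.HReviewAggregateExtractorFactory;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.any23.writer.RDFXMLWriter;
import org.apache.any23.writer.TripleHandler;
import org.openrdf.model.URI;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.w3c.dom.Document;

public class HReviewAggregateExample {

    public static void main(String[] args) throws Exception {
        // Illustrative resource and base URI (assumptions, not from the excerpts).
        final URI baseURI = ValueFactoryImpl.getInstance()
                .createURI("http://example.com/hreview-aggregate-test");
        final InputStream input = new BufferedInputStream(
                HReviewAggregateExample.class.getResourceAsStream("/microformats/hreview-aggregate.html"));

        // Parse the HTML into a DOM, as the hCard/hReview excerpts below do.
        final Document document = new TagSoupParser(input, baseURI.stringValue()).getDOM();

        // Obtain the extractor through its factory (return type assumed).
        final HReviewAggregateExtractor extractor = new HReviewAggregateExtractorFactory().createExtractor();
        final ExtractionContext context = new ExtractionContext(
                extractor.getDescription().getExtractorName(), baseURI);

        // Serialize the extracted triples as RDF/XML into an in-memory buffer.
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler th = new RDFXMLWriter(out);
        extractor.run(
                ExtractionParameters.newDefault(),
                context,
                document,
                new ExtractionResultImpl(context, extractor, th)
        );
        th.close();

        System.out.println(out.toString("UTF-8"));
    }
}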


    private ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();

    public WebResponder(Servlet any23servlet, HttpServletResponse response) {
        this.any23servlet = any23servlet;
        this.response = response;
        this.runner = new Any23();
        runner.setHTTPUserAgent("Any23-Servlet");
    }


            String format,
            boolean report, boolean annotate
    ) throws IOException {
        if (in == null) return;
        if (!initRdfWriter(format, report, annotate)) return;
        final ExtractionReport er;
        try {
            er = runner.extract(eps, in, rdfWriter);
            rdfWriter.close();
            if (!er.hasMatchingExtractors()) {
                sendError(
                        415,
                        "No suitable extractor found for this media type",
                        null,
                        er,
                        report
                );
                return;
            }
        } catch (IOException ioe) {
            // IO Error.
            if (ioe.getCause() != null && ValidatorException.class.equals(ioe.getCause().getClass())) {
                final String errMsg = "Could not fetch input, IO Error.";
                any23servlet.log(errMsg, ioe.getCause());
                sendError(502, errMsg, ioe, null, report);
                return;
            }
            any23servlet.log("Could not fetch input", ioe);
            sendError(502, "Could not fetch input.", ioe, null, report);
            return;
        } catch (ExtractionException e) {
            // Extraction error.
            any23servlet.log("Could not parse input", e);
            sendError(502, "Could not parse input.", e, null, report);
            return;
        } catch (Exception e) {
            any23servlet.log("Internal error", e);
            sendError(500, "Internal error.", e, null, report);
            return;
        }

        /* *** No triples found. *** */
        any23servlet.log("Extraction complete, " + reporter.getTotalTriples() + " triples");
        if (reporter.getTotalTriples() == 0) {
            sendError(
                    501,
                    "Extraction completed. No triples have been found.",
                    null,
                    er, report
            );
            return;
        }

        // Regular response.
        response.setContentType(outputMediaType);
        response.setStatus(200);
        // Set the output encoding equal to the input one.
        final String charsetEncoding = er.getEncoding();
        if (Charset.isSupported(charsetEncoding)) {
            response.setCharacterEncoding(charsetEncoding);
        } else {
            response.setCharacterEncoding("UTF-8");
        }

        final ServletOutputStream sos = response.getOutputStream();

     */
    @Test
    public void testDetectCLIPlugins() throws IOException {
        final Iterator<Tool> tools = manager.getApplicableTools(CRAWLER_TARGET_DIR, CRAWLER_DEPENDENCY_DIR);
        final Set<String> toolClasses = new HashSet<String>();
        Tool tool;
        while(tools.hasNext()) {
            tool = tools.next();
            assertTrue("Found duplicate tool.", toolClasses.add(tool.getClass().getName()));
        }
        assertTrue(
                String.format(
                        "Expected [%s] plugin be detected, but not found int the built classpath",
                        Crawler.class.getName()

        final CompositeTripleHandler cth = new CompositeTripleHandler();
        cth.addChild(rdfxmlWriter);
        cth.addChild(repositoryWriter);

        final ModifiableConfiguration configuration = DefaultConfiguration.copy();
        configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
        SingleDocumentExtraction instance = new SingleDocumentExtraction(
                configuration,
                new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
                extractorGroup,
                cth

        logger.debug(n3);
    }

    @Test
    public void testModifiableConfiguration_issue183() throws Exception {
        final ModifiableConfiguration modifiableConf = DefaultConfiguration.copy();
        modifiableConf.setProperty("any23.extraction.metadata.timesize", "off");
        final Any23 any23 = new Any23(modifiableConf);

        final String content = FileUtils.readResourceContent("/rdf/rdf-issue183.ttl");
        final DocumentSource source = new StringDocumentSource(content, "http://base.com");
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
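The excerpt stops right after the output buffer is created. A plausible continuation, assuming the Any23.extract(DocumentSource, TripleHandler) overload and reusing the RDFXMLWriter shown in the next excerpt, would be:

        // Hypothetical continuation, not part of the original test:
        // run the extraction and serialize the triples into the buffer above.
        final TripleHandler handler = new RDFXMLWriter(out);
        try {
            any23.extract(source, handler);
        } finally {
            handler.close();
        }
        logger.debug(out.toString("UTF-8"));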

    public void testTypedLiteralIncompatibleValueSupport()
    throws IOException, ExtractionException, TripleHandlerException {
        final URI uri = RDFUtils.uri("http://host.com/test-malformed-literal.turtle");
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        final TripleHandler th = new RDFXMLWriter(baos);
        final ExtractionContext extractionContext = new ExtractionContext("turtle-extractor", uri);
        final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, th);
        extractor.setStopAtFirstError(false);
        try {
            extractor.run(
                    ExtractionParameters.newDefault(),

        InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename));
       
        Document document = new TagSoupParser(input, baseURI.stringValue()).getDOM();
        HCardExtractor hCardExtractor = new HCardExtractorFactory().createExtractor();
        ExtractionContext hcExtractionContext = new ExtractionContext(
                hCardExtractor.getDescription().getExtractorName(),
                baseURI
        );
        hCardExtractor.run(
                ExtractionParameters.newDefault(),
                hcExtractionContext,
                document,
                new ExtractionResultImpl(
                        hcExtractionContext,
                        hCardExtractor,
                        new RepositoryWriter(getConnection())
                )
        );
        XFNExtractor xfnExtractor = new XFNExtractorFactory().createExtractor();
        ExtractionContext xfnExtractionContext = new ExtractionContext(
                xfnExtractor.getDescription().getExtractorName(),
                baseURI
        );
        xfnExtractor.run(
                        ExtractionParameters.newDefault(),

       
        InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename));

        Document document = new TagSoupParser(input, baseURI.stringValue()).getDOM();
        HCardExtractor hCardExtractor = new HCardExtractorFactory().createExtractor();
        ExtractionContext hCardExtractionContext = new ExtractionContext(
                hCardExtractor.getDescription().getExtractorName(), baseURI
        );
        hCardExtractor.run(
                ExtractionParameters.newDefault(),
                hCardExtractionContext,
                document,
                new ExtractionResultImpl(
                        hCardExtractionContext,
                        hCardExtractor, new RepositoryWriter(getConnection())
                )
        );

        GeoExtractor geoExtractor = new GeoExtractorFactory().createExtractor();
        ExtractionContext geoExtractionContext = new ExtractionContext(
                geoExtractor.getDescription().getExtractorName(), baseURI
        );
        geoExtractor.run(
                ExtractionParameters.newDefault(),
                geoExtractionContext,
                document,
                new ExtractionResultImpl(
                        geoExtractionContext,
                        geoExtractor,
                        new RepositoryWriter(getConnection())
                )
        );

        AdrExtractor adrExtractor = new AdrExtractorFactory().createExtractor();
        ExtractionContext adrExtractionContext = new ExtractionContext(
                adrExtractor.getDescription().getExtractorName(), baseURI
        );
        adrExtractor.run(
                ExtractionParameters.newDefault(),
                adrExtractionContext,

    private void extractHRevAndRelated(String filename) throws ExtractionException, IOException {
        extractHCardAndRelated(filename);
        InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename));
        Document document = new TagSoupParser(input, baseURI.stringValue()).getDOM();
        HReviewExtractor hReviewExtractor = new HReviewExtractorFactory().createExtractor();
        ExtractionContext hreviewExtractionContext = new ExtractionContext(
                hReviewExtractor.getDescription().getExtractorName(), baseURI
        );
        hReviewExtractor.run(
                ExtractionParameters.newDefault(),
                hreviewExtractionContext,

    @Test
    public void testRun() throws IOException, ExtractionException {
        final InputStream is = this.getClass().getResourceAsStream("html-scraper-extractor-test.html");
        final ExtractionResult extractionResult = mock(ExtractionResult.class);
        final URI pageURI = ValueFactoryImpl.getInstance().createURI("http://fake/test/page/testrun");
        final ExtractionContext extractionContext = new ExtractionContext(
                extractor.getDescription().getExtractorName(),
                pageURI
        );
        extractor.run(ExtractionParameters.newDefault(), extractionContext, is, extractionResult);
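Since extractionResult is a Mockito mock, a natural follow-up (not shown in the excerpt) would be to verify that at least one triple was written. This assumes the Sesame-era ExtractionResult.writeTriple(Resource, URI, Value) signature and statically imported Mockito verify/atLeastOnce/any:

        // Hypothetical verification, not part of the original excerpt.
        verify(extractionResult, atLeastOnce()).writeTriple(
                any(Resource.class), any(URI.class), any(Value.class));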
