Package org.apache.any23.plugin.crawler

Examples of org.apache.any23.plugin.crawler.SiteCrawler


    public void testGZippedContent() throws IOException, URISyntaxException, ExtractionException {
        assumeOnlineAllowed();

        Any23 runner = new Any23();
        runner.setHTTPUserAgent("test-user-agent");
        HTTPClient httpClient = runner.getHTTPClient();
        DocumentSource source = new HTTPDocumentSource(
                httpClient,
                "http://products.semweb.bestbuy.com/y/products/7590289/"
        );
        ByteArrayOutputStream out = new ByteArrayOutputStream();
View Full Code Here


                configuration,
                new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
                extractorGroup,
                cth
        );
        instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
        return instance;
    }
View Full Code Here

                configuration,
                new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
                extractorGroup,
                cth
        );
        instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
        return instance;
    }
View Full Code Here

    private TikaMIMETypeDetector detector;

    @Before
    public void setUp() throws Exception {
        detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
    }
View Full Code Here

                        format( "Storage folder %s can not be created, please verify you have enough permissions",
                                                         storageFolder ) );
            }
        }

        final SiteCrawler siteCrawler = new SiteCrawler( storageFolder );
        siteCrawler.setNumOfCrawlers( numCrawlers );
        siteCrawler.setMaxPages( maxPages );
        siteCrawler.setMaxDepth( maxDepth );
        siteCrawler.setPolitenessDelay(politenessDelay);

        siteCrawler.addListener(new CrawlerListener() {
            @Override
            public void visitedPage(Page page) {
                final String pageURL = page.getWebURL().getURL();
                System.err.println( format("Processing page: [%s]", pageURL) );

                final ParseData parseData = page.getParseData();
                if (parseData instanceof HtmlParseData) {
                    final HtmlParseData htmlParseData = (HtmlParseData) parseData;
                    try {
                        synchronized (roverLock) {
                            Crawler.super.performExtraction(
                                    new StringDocumentSource(
                                            htmlParseData.getHtml(),
                                            pageURL

                                    )
                            );
                        }
                    } catch (Exception e) {
                        System.err.println(format("Error while processing page [%s], error: %s .",
                                                  pageURL, e.getMessage())
                        );
                    }
                }
            }
        });

        Runtime.getRuntime().addShutdownHook( new Thread() {
            /**
             * Body of the thread registered as a JVM shutdown hook: prints the result of
             * {@code Crawler.super.printReports()} to standard error. Any exception raised
             * while doing so is caught and its stack trace is printed to standard error
             * instead of being propagated.
             */
            @Override
            public void run() {
                try {
                    System.err.println( Crawler.super.printReports() );
                    // siteCrawler.stop(); // TODO: stopping the crawler here causes the shutdown to hang.
                } catch (Exception e) {
                    e.printStackTrace(System.err);
                }
            }
        });
        siteCrawler.start(seed, pageFilter, true);
    }
View Full Code Here

    private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
        String fromRequest = getFormatFromRequest(request);
        if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
            return fromRequest;
        }
        MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
        if (result == null) {
            return null;
        }
        else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) {
            return "turtle";
        }
        else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
            return "n3";
        }
        else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
            return "nq";
        }
        else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
            return "rdf";
        }
        else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
            return "nt";
        }
        else {
            return "turtle";    // shouldn't happen
        }
View Full Code Here

            );
            return;
        }
        log("Attempting conversion to '" + format + "' from POST body");
        responder.runExtraction(
                new ByteArrayDocumentSource(
                        req.getInputStream(),
                        Servlet.DEFAULT_BASE_URI,
                        getContentTypeHeader(req)
                ),
                eps,
View Full Code Here

        /*2*/ final String content = "@prefix foo: <http://example.org/ns#> .   " +
                                     "@prefix : <http://other.example.org/ns#> ." +
                                     "foo:bar foo: : .                          " +
                                     ":bar : foo:bar .                           ";
        //    The second argument of StringDocumentSource() must be a valid URI.
        /*3*/ DocumentSource source = new StringDocumentSource(content, "http://host.com/service");
        /*4*/ ByteArrayOutputStream out = new ByteArrayOutputStream();
        /*5*/ TripleHandler handler = new NTriplesWriter(out);
              try {
        /*6*/     runner.extract(source, handler);
              } finally {
View Full Code Here

        assumeOnlineAllowed();

        /*1*/ Any23 runner = new Any23();
        /*2*/ runner.setHTTPUserAgent("test-user-agent");
        /*3*/ HTTPClient httpClient = runner.getHTTPClient();
        /*4*/ DocumentSource source = new HTTPDocumentSource(
                 httpClient,
                 "http://dbpedia.org/resource/Trento"
              );
        /*5*/ ByteArrayOutputStream out = new ByteArrayOutputStream();
        /*6*/ TripleHandler handler = new NTriplesWriter(out);
View Full Code Here

        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        TripleHandler handler = new RDFXMLWriter(byteArrayOutputStream);
        TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
        ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);

        DocumentSource source = getDocumentSourceFromResource(
                    "/html/rdfa/ansa_2010-02-26_12645863.html",
                    "http://host.com/service");

        Assert.assertTrue( any23.extract(source, reporting).hasMatchingExtractors() );
        try {
View Full Code Here

TOP

Related Classes of org.apache.any23.plugin.crawler.SiteCrawler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.