Package org.archive.modules

Examples of org.archive.modules.CrawlMetadata


        record.addLabelValue("format","WARC File Format 1.0");
        record.addLabelValue("conformsTo","http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
       
        // Get other values from metadata provider

        CrawlMetadata provider = getMetadataProvider();

        addIfNotBlank(record,"operator", provider.getOperator());
        addIfNotBlank(record,"publisher", provider.getOrganization());
        addIfNotBlank(record,"audience", provider.getAudience());
        addIfNotBlank(record,"isPartOf", provider.getJobName());
        // TODO: make date match 'job creation date' as in Heritrix 1.x
        // until then, leave out (plenty of dates already in WARC
        // records)
//            String rawDate = provider.getBeginDate();
//            if(StringUtils.isNotBlank(rawDate)) {
//                Date date;
//                try {
//                    date = ArchiveUtils.parse14DigitDate(rawDate);
//                    addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date));
//                } catch (ParseException e) {
//                    logger.log(Level.WARNING,"obtaining warc created date",e);
//                }
//            }
        addIfNotBlank(record,"description", provider.getDescription());
        addIfNotBlank(record,"robots", provider.getRobotsPolicyName().toLowerCase());

        addIfNotBlank(record,"http-header-user-agent",
                provider.getUserAgent());
        addIfNotBlank(record,"http-header-from",
                provider.getOperatorFrom());

        // really ugly to return as List<String>, but changing would require
        // larger refactoring
        return Collections.singletonList(record.toString());
    }
View Full Code Here


    @Override
    protected Extractor makeExtractor() {
        ExtractorHTML result = new ExtractorHTML();
        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule()
        result.setLoggerModule(ulm);
        CrawlMetadata metadata = new CrawlMetadata();
        metadata.afterPropertiesSet();
        result.setMetadata(metadata);
        result.setExtractorJS(new ExtractorJS());
        result.afterPropertiesSet();
        return result;
    }
View Full Code Here

            "<!--[if IE 6]><img src=\"foo.gif\"><![endif]-->" +
            "<!--[if IE 6]><script src=\"foo.js\"><![endif]-->";
        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule()
        getExtractor().setLoggerModule(ulm);
        CrawlMetadata metadata = new CrawlMetadata();
        metadata.afterPropertiesSet();
        getExtractor().setMetadata(metadata);
        getExtractor().afterPropertiesSet();
       
        getExtractor().extract(curi, cs);
       
View Full Code Here

    @Override
    protected Extractor makeExtractor() {
        JerichoExtractorHTML result = new JerichoExtractorHTML();
        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule();
        result.setLoggerModule(ulm);
        CrawlMetadata metadata = new CrawlMetadata();
        metadata.afterPropertiesSet();
        result.setMetadata(metadata);
        result.setExtractorJS(new ExtractorJS());
        result.afterPropertiesSet();
        return result;
    }
View Full Code Here

   
    public static FetchHTTP newTestFetchHttp(String userAgentString) {
        FetchHTTP fetchHttp = new FetchHTTP();
        fetchHttp.setCookieStore(new SimpleCookieStore());
        fetchHttp.setServerCache(new DefaultServerCache());
        CrawlMetadata uap = new CrawlMetadata();
        uap.setUserAgentTemplate(userAgentString);
        fetchHttp.setUserAgentProvider(uap);

        fetchHttp.start();
        return fetchHttp;
    }
View Full Code Here

        FileUtils.ensureWriteableDirectory(tmp);
       
        ARCWriterProcessor result = new ARCWriterProcessor();
        result.setDirectory(new ConfigPath("test",tmp.getAbsolutePath()));
        result.setServerCache(new DefaultServerCache());
        CrawlMetadata metadata = new CrawlMetadata();
        metadata.afterPropertiesSet();
        result.setMetadataProvider(metadata);
        result.start();
        return result;
    }
View Full Code Here

        FileUtils.ensureWriteableDirectory(tmp);

        WARCWriterProcessor result = new WARCWriterProcessor();
        result.setDirectory(new ConfigPath("test",tmp.getAbsolutePath()));
        result.setServerCache(new DefaultServerCache());
        CrawlMetadata metadata = new CrawlMetadata();
        metadata.afterPropertiesSet();
        result.setMetadataProvider(metadata);
        return result;
    }
View Full Code Here

        protected FetchHTTP fetcher() {
            if (fetcher == null) {
                fetcher = new FetchHTTP();
                // f.setCookieStore(cookieStore);
                fetcher.setServerCache(new AlwaysLocalhostServerCache());
                CrawlMetadata uap = new CrawlMetadata();
                uap.setUserAgentTemplate(getClass().getName());
                fetcher.setUserAgentProvider(uap);
                fetcher.start();
            }
            return fetcher;
        }
View Full Code Here

    protected FetchHTTP getFetcher() throws IOException {
        if (fetchHttp == null) {
            fetchHttp = new FetchHTTP();
            fetchHttp.setCookieStore(new SimpleCookieStore());
            fetchHttp.setServerCache(new DefaultServerCache());
            CrawlMetadata uap = new CrawlMetadata();
            uap.setUserAgentTemplate(getClass().getName());
            fetchHttp.setUserAgentProvider(uap);
           
            fetchHttp.start();
        }
       
View Full Code Here

TOP

Related Classes of org.archive.modules.CrawlMetadata

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle, Inc. Contact coftware#gmail.com.