Examples of bixo.config.ParserPolicy

bixo.config.ParserPolicy
Definition of policy for parsing.


@SuppressWarnings("serial")
public class FakeParser extends BaseParser {


    public FakeParser() {
        super(new ParserPolicy());
    }

View Full Code Here

        contentPipe = TupleLogger.makePipe(contentPipe, true);
        
        // Take content and split it into content output plus parse to extract URLs.
        SimpleParser parser;
        if (options.isUseBoilerpipe()) {
            parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
        } else if (options.isGenerateHTML()) {
            parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
        } else {
            parser = new SimpleParser();
        }
        
        parser.setExtractLanguage(false);

View Full Code Here

        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
        
        // Call parser.parse
        ParserPolicy policy = new ParserPolicy( ParserPolicy.DEFAULT_MAX_PARSE_DURATION,
                                                BaseLinkExtractor.ALL_LINK_TAGS,
                                                BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
        SimpleParser parser = new SimpleParser(policy);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);

View Full Code Here

            new HashSet<String>() {{
                add("href");
                add("src");
            }};


        ParserPolicy policy = new ParserPolicy( ParserPolicy.DEFAULT_MAX_PARSE_DURATION,
                                                linkTags,
                                                linkAttributeTypes);
        SimpleParser parser = new SimpleParser(policy);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);

View Full Code Here

            @Override
            public Outlink[] getLinks() {
                return new Outlink[0];
            }
        },
        new ParserPolicy());
        
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
        
        // Verify content is correct
        Assert.assertEquals("Simple", parsedDatum.getTitle());

View Full Code Here

        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
        
        ParserPolicy policy = new ParserPolicy(Integer.MAX_VALUE);
        SimpleParser parser = new SimpleParser(policy);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
        
        // Verify we got no URLs
        Assert.assertEquals(0, parsedDatum.getOutlinks().length);

View Full Code Here

        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
        
        // Call parser.parse
        ParserPolicy policy = new ParserPolicy( ParserPolicy.NO_MAX_PARSE_DURATION,
                                                BaseLinkExtractor.ALL_LINK_TAGS,
                                                BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
        SimpleParser parser = new SimpleParser(new SimpleContentExtractor(), new SimpleLinkExtractor(), policy, true);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);

View Full Code Here

        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(htmlText.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
        
        // Call parser.parse
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
        
        // Now take the resulting HTML, process it using Dom4J
        SAXReader reader = new SAXReader(new Parser());
        reader.setEncoding("UTF-8");

View Full Code Here

        policy.setMaxContentSize(options.getMaxSize());
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(1, policy, new FirefoxUserAgent());
        fetcher.setMaxRetryCount(options.getMaxRetries());
        
        // Give a long timeout for parsing
        ParserPolicy parserPolicy = new ParserPolicy(MAX_PARSE_DURATION);
        SimpleParser parser = new SimpleParser(parserPolicy);


        SimpleParser rawParser = new SimpleParser(parserPolicy, true);
        
        // Create Boilperpipe content extractor

View Full Code Here

        Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
        Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
        contentPipe = TupleLogger.makePipe(contentPipe, true);


        // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);
        
        Pipe analyzerPipe = new Pipe("analyzer pipe");
        analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

View Full Code Here

0 1

TOP

Related Classes of bixo.config.ParserPolicy

bixo.examples.crawl.DemoCrawlWorkflow

bixo.examples.webmining.DemoWebMiningWorkflow

bixo.parser.FakeParser

bixo.parser.SimpleParser

bixo.parser.SimpleParserTest

bixo.tools.FetchAndParseTool

java.security.InvalidParameterException

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.