Package bixo.datum

Examples of bixo.datum.ParsedDatum


        super(new ParserPolicy());
    }

    @Override
    public ParsedDatum parse(FetchedDatum fetchedDatum) {
        ParsedDatum parsedDatum = new ParsedDatum("url", "127.0.0.1", "someParsedText", "en", "title", new Outlink[0], null);
        parsedDatum.setPayload(fetchedDatum.getPayload());
        return parsedDatum;
    }
View Full Code Here


        LOGGER.info("Ending creation writable sequence file tuples");
    }

    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<NullContext> funcCall) {
        ParsedDatum datum = new ParsedDatum(funcCall.getArguments());
        Text key = new Text(datum.getUrl());
        Text value = new Text(datum.getTitle() + '\n' + datum.getParsedText());
        Tuple keyVal = new Tuple(key, value);
        funcCall.getOutputCollector().add(keyVal);
    }
View Full Code Here

        LOGGER.info("Ending creation of outlink URLs");
    }

    @Override
    public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
        ParsedDatum datum = new ParsedDatum(funcCall.getArguments());
        Outlink outlinks[] = datum.getOutlinks();

        // Bump the crawl depth value only on a successful parse
        int crawlDepth = (Integer) datum.getPayloadValue(CrawlDbDatum.CRAWL_DEPTH);
        datum.setPayloadValue(CrawlDbDatum.CRAWL_DEPTH, crawlDepth + 1);

        TupleEntryCollector collector = funcCall.getOutputCollector();

        for (Outlink outlink : outlinks) {
            String url = outlink.getToUrl();
            url = url.replaceAll("[\n\r]", "");
            url = _normalizer.normalize(url);
            if (_validator.isValid(url)) {
                UrlDatum urlDatum = new UrlDatum(url);
                urlDatum.setPayload(datum.getPayload());
                collector.add(urlDatum.getTuple());
            }
        }
    }
View Full Code Here

            FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
            System.out.println(String.format("Fetched %s: headers = %s", result.getUrl(), result.getHeaders()));
            System.out.flush();
           
            // System.out.println("Result = " + result.toString());
            ParsedDatum parsed = parser.parse(result);
            System.out.println(String.format("Parsed %s: lang = %s, size = %d", parsed.getUrl(),
                            parsed.getLanguage(), parsed.getParsedText().length()));
           
            ParsedDatum bpParsed = bpParser.parse(result);
            ParsedDatum rawParsed = rawParser.parse(result);
           
            if (interactive) {
                while (true) {
                    System.out.print("Next action - (d)ump regular, dump (b)oilerpipe, dump (r)aw, (e)xit: ");
                    String action = readInputLine();
                    if (action.startsWith("e") || (action.length() == 0)) {
                        break;
                        } else if (action.startsWith("d")) {
                            System.out.println("=====================================================================");
                            System.out.println(parsed.getParsedText());
                            System.out.println("=====================================================================");
                        } else if (action.startsWith("b")) {
                            System.out.println("=====================================================================");
                            System.out.println(bpParsed.getParsedText());
                            System.out.println("=====================================================================");
                        } else if (action.startsWith("r")) {
                            System.out.println("=====================================================================");
                            System.out.println(rawParsed.getParsedText());
                            System.out.println("=====================================================================");
                    } else {
                        System.out.println("Unknown command - " + action);
                    }
                }
View Full Code Here

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
    // Verify outlink is correct.
    Outlink[] outlinks = parsedDatum.getOutlinks();
    Assert.assertEquals(2, outlinks.length);
   
    // TODO KKr - reenable this test when Tika parser calls my handler with
    // the <base> element, which is needed to correctly resolve relative links.
    // Assert.assertEquals("http://newdomain.com/link", outlinks[0].getToUrl());
View Full Code Here

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
    // Verify outlink is correct.
    Outlink[] outlinks = parsedDatum.getOutlinks();
    Assert.assertEquals(2, outlinks.length);
   
    Assert.assertEquals("http://newdomain.com/link1", outlinks[0].getToUrl());
    Assert.assertEquals("link1", outlinks[0].getAnchor());
        // TODO KKr - reenable this test when Tika changes are submitted:
View Full Code Here

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
    // Verify outlink is correct.
    Outlink[] outlinks = parsedDatum.getOutlinks();
    Assert.assertEquals(2, outlinks.length);
   
    Assert.assertEquals("http://olddomain.com/redirected/link1", outlinks[0].getToUrl());
    Assert.assertEquals("link1", outlinks[0].getAnchor());
    Assert.assertEquals("http://domain.com/link2", outlinks[1].getToUrl());
View Full Code Here

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, redirectedUrl, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
    // Verify outlink is correct.
    Outlink[] outlinks = parsedDatum.getOutlinks();
    Assert.assertEquals(2, outlinks.length);
   
    Assert.assertEquals("http://newdomain.com/link1", outlinks[0].getToUrl());
    Assert.assertEquals("link1", outlinks[0].getAnchor());
    Assert.assertEquals("http://domain.com/link2", outlinks[1].getToUrl());
View Full Code Here

        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
       
        // Call parser.parse
        SimpleParser parser = new SimpleParser();
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
       
        // Verify outlinks are correct (and we only get the a href ones).
        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(2, outlinks.length);
       
        Assert.assertEquals("http://newdomain.com/link1", outlinks[0].getToUrl());
        Assert.assertEquals("link1", outlinks[0].getAnchor());
        Assert.assertEquals("http://domain.com/link2", outlinks[1].getToUrl());
View Full Code Here

        // Call parser.parse
        ParserPolicy policy = new ParserPolicy( ParserPolicy.DEFAULT_MAX_PARSE_DURATION,
                                                BaseLinkExtractor.ALL_LINK_TAGS,
                                                BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
        SimpleParser parser = new SimpleParser(policy);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
       
        // Verify outlinks are correct (with all link tags and attribute types enabled, we get more than just the a href ones).
        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(7, outlinks.length);
       
        Assert.assertEquals("http://newdomain.com/favicon.ico", outlinks[0].getToUrl());
        Assert.assertEquals("http://newdomain.com/link1", outlinks[1].getToUrl());
        Assert.assertEquals("link1", outlinks[1].getAnchor());
View Full Code Here

TOP

Related Classes of bixo.datum.ParsedDatum

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.