Examples of FetchedDatum


Examples of bixo.datum.FetchedDatum

       
        String contentType = "text/html; charset=utf-8";
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
       
        // Call parser.parse
        Set<String> linkTags =
            new HashSet<String>() {{
                add("a");
View Full Code Here

Examples of bixo.datum.FetchedDatum

    String url = "http://domain.com/simple-content.html";
    String contentType = "text/html; charset=utf-8";
    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
View Full Code Here

Examples of bixo.datum.FetchedDatum

    @Test
    public void testHtmlParsing() throws Exception {
        URL path = SimpleParserTest.class.getResource("/simple-page.html");

        BaseParser parser = new SimpleParser();
        FetchedDatum content = makeFetchedDatum(path);
        ParsedDatum parse = parser.parse(content);
        Assert.assertNotNull(parse.getParsedText());
       
        // TODO - add back in title text to simple-page, when we generate this
        File parsedTextFile = new File(SimpleParserTest.class.getResource("/" + "simple-page.txt").getFile());
View Full Code Here

Examples of bixo.datum.FetchedDatum

        String url = "http://domain.com/simple-content.html";
        String contentType = "text/html; charset=utf-8";
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
       
        SimpleParser parser = new SimpleParser(new BaseContentExtractor() {

            @Override
            public String getContent() {
View Full Code Here

Examples of bixo.datum.FetchedDatum

        String url = "http://domain.com/meta-nofollow.html";
        String contentType = "text/html; charset=utf-8";
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
       
        ParserPolicy policy = new ParserPolicy(Integer.MAX_VALUE);
        SimpleParser parser = new SimpleParser(policy);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
       
View Full Code Here

Examples of bixo.datum.FetchedDatum

    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    headers.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
View Full Code Here

Examples of bixo.datum.FetchedDatum

    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    headers.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
View Full Code Here

Examples of bixo.datum.FetchedDatum

 
  @Test
  public void testSplitterWithNonMbox() {
    MboxSplitterFunction splitter = new MboxSplitterFunction();
   
    FetchedDatum datum = new FetchedDatum("baseUrl", "redirectedUrl", 0, new HttpHeaders(), new ContentBytes(), "text/ascii", 0);
    TupleEntry value = new TupleEntry(datum.getTupleEntry());
   
    when(_funcCall.getArguments()).thenReturn(value);
    splitter.operate(_process, _funcCall);
   
    verify(_collector).add(value);
View Full Code Here

Examples of bixo.datum.FetchedDatum

  public void testSplitterTwoEmails() throws UnsupportedEncodingException {
    MboxSplitterFunction splitter = new MboxSplitterFunction();

    final String mboxString = "From 1\r\rContent 1\r\rFrom 2\r\rContent 2";
    byte[] mboxContent = mboxString.getBytes("us-ascii");
    FetchedDatum datum = new FetchedDatum("baseUrl", "redirectedUrl", 0, new HttpHeaders(), new ContentBytes(mboxContent), "application/mbox", 0);
    TupleEntry value = new TupleEntry(datum.getTupleEntry());
   
    when(_funcCall.getArguments()).thenReturn(value);
    splitter.operate(_process, _funcCall);

    verify(_collector, times(2)).add(any(TupleEntry.class));
View Full Code Here

Examples of bixo.datum.FetchedDatum

              } else {
                url = args[index++];
              }

              System.out.println("Fetching " + url);
            FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
            System.out.println(String.format("Fetched %s: headers = %s", result.getUrl(), result.getHeaders()));
            System.out.flush();
           
            // System.out.println("Result = " + result.toString());
            ParsedDatum parsed = parser.parse(result);
            System.out.println(String.format("Parsed %s: lang = %s, size = %d", parsed.getUrl(),
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.