Examples of FetchedDatum


Examples of bixo.datum.FetchedDatum

          outputCollector.add(makeNewTupleEntry(fetchedDatum, email));
        }
  }
 
  private TupleEntry makeNewTupleEntry(FetchedDatum fetchedDatum, StringBuilder email) {
      FetchedDatum newDatum = new FetchedDatum(new TupleEntry(fetchedDatum.getTupleEntry()));
      newDatum.setContent(new ContentBytes(safeGetAsciiBytes(email.toString())));
      return newDatum.getTupleEntry();
  }
View Full Code Here

Examples of bixo.datum.FetchedDatum

  }
 
  @Override
  public void operate(FlowProcess process, FunctionCall<NullContext> functionCall) {
        TupleEntry arguments = functionCall.getArguments();
        FetchedDatum fetchedDatum = new FetchedDatum(arguments.getTuple());

        if (fetchedDatum.getContentType().startsWith("text/html")) {
          init();

          Metadata metadata = new Metadata();
          InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes());
         
          try {
            _parser.parse(is, _handler, metadata, new ParseContext());

            // _ids now has a list of the mailbox IDs that we use to create URLs.
            for (String id : _ids) {
              String url = String.format("%s/%s.mbox", fetchedDatum.getUrl(), id);
              UrlDatum datum = new UrlDatum(url);
              functionCall.getOutputCollector().add(datum.getTuple());
            }
          } catch (Exception e) {
        LOGGER.error("Exception parsing mod_mbox page", e);
View Full Code Here

Examples of bixo.datum.FetchedDatum

        // "crawler" user agent name in the robots.txt file.
        final String simpleRobotsTxt = "User-agent: crawler" + "\r\n"
        + "Disallow: /";

        BaseFetcher fetcher = Mockito.mock(BaseFetcher.class);
        FetchedDatum datum = Mockito.mock(FetchedDatum.class);
        Mockito.when(datum.getContentBytes()).thenReturn(simpleRobotsTxt.getBytes());
        Mockito.when(fetcher.get(Mockito.any(ScoredUrlDatum.class))).thenReturn(datum);
        UserAgent userAgent = new UserAgent("testAgent", "crawler@domain.com", "http://www.domain.com");
        Mockito.when(fetcher.getUserAgent()).thenReturn(userAgent);
       
        URL robotsUrl = new URL("http://www.domain.com/robots.txt");
View Full Code Here

Examples of bixo.datum.FetchedDatum

            Thread.currentThread().interrupt();
        }

        HttpHeaders headers = new HttpHeaders();
        headers.add("x-responserate", "" + bytesPerSecond);
        FetchedDatum result = new FetchedDatum(url, url, System.currentTimeMillis(), headers, new ContentBytes(new byte[contentSize]), "text/html", bytesPerSecond);
        result.setPayload(payload);
        return result;
    }
View Full Code Here

Examples of bixo.datum.FetchedDatum

    String url = "http://olddomain.com/base-url.html";
    String contentType = "text/html; charset=utf-8";
    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
View Full Code Here

Examples of bixo.datum.FetchedDatum

    String contentType = "text/html; charset=utf-8";
    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    headers.add(HttpHeaderNames.CONTENT_LOCATION, location);
    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
View Full Code Here

Examples of bixo.datum.FetchedDatum

    String contentType = "text/html; charset=utf-8";
    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    headers.add(HttpHeaderNames.CONTENT_LOCATION, location);
    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
View Full Code Here

Examples of bixo.datum.FetchedDatum

   
    String contentType = "text/html; charset=utf-8";
    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, redirectedUrl, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
View Full Code Here

Examples of bixo.datum.FetchedDatum

       
        String contentType = "text/html; charset=utf-8";
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
       
        // Call parser.parse
        SimpleParser parser = new SimpleParser();
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
       
View Full Code Here

Examples of bixo.datum.FetchedDatum

       
        String contentType = "text/html; charset=utf-8";
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
       
        // Call parser.parse
        ParserPolicy policy = new ParserPolicy( ParserPolicy.DEFAULT_MAX_PARSE_DURATION,
                                                BaseLinkExtractor.ALL_LINK_TAGS,
                                                BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.