Examples of FetchedDatum


Examples of bixo.datum.FetchedDatum

  public void operate(FlowProcess process, FunctionCall<NullContext> functionCall) {
    init();
   
    // On input we have a FetchedDatum that holds a single email.
        TupleEntry arguments = functionCall.getArguments();
        FetchedDatum fetchedDatum = new FetchedDatum(arguments.getTuple());
       
        // Now, if the FetchedDatum mime-type is application/mbox, we want to parse it and
        // output the results
        if (fetchedDatum.getContentType().equals("application/mbox")) {
          Metadata metadata = new Metadata();
          ParseContext context = new ParseContext();
          InputStream is = new ByteArrayInputStream(fetchedDatum.getContentBytes());
         
          try {
            _parser.parse(is, _handler, metadata, context);

            // _content now has all of the body text, and metadata has the header info.
View Full Code Here

Examples of bixo.datum.FetchedDatum

    HttpHeaders headers = new HttpHeaders();
    headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
    headers.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");

    ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
    FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
   
    // Call parser.parse
    SimpleParser parser = new SimpleParser();
    ParsedDatum parsedDatum = parser.parse(fetchedDatum);
   
View Full Code Here

Examples of bixo.datum.FetchedDatum

        String url = "http://domain.com/music.html";
        String contentType = "text/html; charset=utf-8";
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
       
        // Call parser.parse
        ParserPolicy policy = new ParserPolicy( ParserPolicy.NO_MAX_PARSE_DURATION,
                                                BaseLinkExtractor.ALL_LINK_TAGS,
                                                BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
View Full Code Here

Examples of bixo.datum.FetchedDatum

        String url = "http://domain.com/page.html";
        String contentType = "text/html; charset=utf-8";
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_TYPE, contentType);
        ContentBytes content = new ContentBytes(htmlText.getBytes("utf-8"));
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0);
       
        // Call parser.parse
        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);
       
View Full Code Here

Examples of bixo.datum.FetchedDatum

        byte[] bytes = new byte[(int) file.length()];
        DataInputStream in = new DataInputStream(new FileInputStream(file));
        in.readFully(bytes);

        String url = path.toExternalForm().toString();
        FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), new HttpHeaders(), new ContentBytes(bytes), "text/html", 0);
        return fetchedDatum;
    }
View Full Code Here

Examples of bixo.datum.FetchedDatum

        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;

            // Verify we can convert properly
            FetchedDatum datum = new FetchedDatum(entry);
            String url = datum.getUrl();
            Assert.assertNotNull(url);
           
            // Verify that we got one of each page
            int idOffset = url.indexOf(".html") - 1;
            int pageId = Integer.parseInt(url.substring(idOffset, idOffset + 1));
View Full Code Here

Examples of bixo.datum.FetchedDatum

        int totalEntries = 0;
        while (tupleEntryIterator.hasNext()) {
            TupleEntry entry = tupleEntryIterator.next();
            totalEntries += 1;
           
            FetchedDatum datum = new FetchedDatum(entry);
            String payloadValue = (String)datum.getPayloadValue("key");
            Assert.assertNotNull(payloadValue);
            Assert.assertEquals("value", payloadValue);
        }
       
        Assert.assertEquals(1, totalEntries);
View Full Code Here

Examples of bixo.datum.FetchedDatum

                for (String key : keys) {
                    String value = header.getHeaderValue(key).toString();
                    headers.add(key, value);
                }
               
                FetchedDatum contentTuple = new FetchedDatum(url, url, System.currentTimeMillis(), headers, new ContentBytes(content), mimetype, 0);
                write.add(contentTuple.getTuple());
            }
        }

        write.close();
        FlowConnector flowConnector = platform.makeFlowConnector();
View Full Code Here

Examples of bixo.datum.FetchedDatum

    @Override
    public void run() {
        String redirectedUrl = _url;

        try {
            FetchedDatum fd = _fetcher.get(new ScoredUrlDatum(_url));
            redirectedUrl = fd.getFetchedUrl();
            LOGGER.debug(String.format("No redirection of %s to %s", _url, redirectedUrl));
        } catch (RedirectFetchException e) {
            // We'll get this exception if the URL that's redirected by
            // a link shortening site is to a URL that gets redirected again.
            // In this case, we've captured the final URL in the exception,
View Full Code Here

Examples of bixo.datum.FetchedDatum

            throw new IllegalStateException("Can't change accept encoding after HttpClient has been initialized");
        }
    }
   
    private static FetchedDatum convert(FetchedResult result) {
      FetchedDatum datum = new FetchedDatum(result.getBaseUrl(), result.getFetchedUrl(), result.getFetchTime(),
                      result.getHeaders(), new ContentBytes(result.getContent()), result.getContentType(),
                      result.getResponseRate());
      datum.setNewBaseUrl(result.getNewBaseUrl());
      datum.setNumRedirects(result.getNumRedirects());
      datum.setHostAddress(result.getHostAddress());
      datum.setPayload(result.getPayload());
      return datum;
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.