Examples of LinkExtractor


Examples of com.ontometrics.scraper.extraction.LinkExtractor

//  private static final String listingTableKeyword = "Opportunity Title";
//  private static final String linkPattern = "mode=VIEW";

  @Test
  public void canExtractLinksFromListingPage() {
    List<Link> foundLinks = new LinkExtractor().source(html().url(PagedListingTable.getUrl())).getLinks();

    log.info("found {} links: {}", foundLinks.size(), foundLinks);
    assertThat(foundLinks.size(), is(greaterThan(0)));
  }
View Full Code Here

Examples of com.ontometrics.scraper.extraction.LinkExtractor

  @Test
    @Ignore
  public void canExtractLinksFromMultiplePagesThenFollowToDetailsPage() throws MalformedURLException {
    Iterator pageIterator = new Iterator() {

      List<Link> foundLinks = new LinkExtractor().source(html().url(PagedListingTable.getUrl())).getLinks();
      ListIterator<Link> foundLinksIterator = foundLinks.listIterator();

      @Override
      public URL next() {
        Link currentLink = (Link) foundLinksIterator.next();
        log.debug("current iterating page = {}", currentLink.getHref());

        String CurrentURLStr = PagedListingFolder.getPath() + "/" + currentLink.getHref();
        URL currentURL = TestUtil.getFileAsURL(CurrentURLStr);

        return currentURL;
      }

      @Override
      public boolean hasNext() {

        try {
          Link nextLink = foundLinks.get(foundLinksIterator.nextIndex());
          log.debug("next page to iterate = {}", nextLink.getHref());
        } catch (Exception e) {
          log.debug("Page iterator can't peek next link");
        }

        return foundLinksIterator.hasNext();
      }

      /** Returns the value of {@code PagedListingFolder.getUrl()} as this iterator's base URL. */
      @Override
      public URL getBaseUrl() {
        return PagedListingFolder.getUrl();
      }
    };
    HtmlExtractor htmlExtractor = html().url(PagedListingTable.getUrl());

    List<Record> records = new ListingDetailScraper().setConvertURLs(true)
        .iterator(pageIterator)
        .listing(new LinkExtractor().source(htmlExtractor))
        .details(new DefaultFieldExtractor().source(htmlExtractor))
        .getRecords();

    assertThat(records.size(), greaterThan(0));
View Full Code Here

Examples of org.apache.droids.parse.html.LinkExtractor

    String charset = entity.getCharset();
    if (charset == null) {
      charset = "UTF-8";
    }
    EchoHandler data = new EchoHandler(charset);
    LinkExtractor extractor = new LinkExtractor(link, elements);
   
    TeeContentHandler parallelHandler = new TeeContentHandler(data, extractor);

    InputStream instream = entity.obtainContent();
    try {
      parser.parse(instream, parallelHandler, metadata);
     
      return new ParseImpl(data.toString(), extractor.getLinks());
    } catch (SAXException ex) {
      throw new DroidsException("Failure parsing document " + link.getId(), ex);
    } catch (TikaException ex) {
      throw new DroidsException("Failure parsing document " + link.getId(), ex);
    } finally {
View Full Code Here

Examples of penny.parser.LinkExtractor

    private List<String> srcLinks;

    public LinkParser(Download download) throws URISyntaxException {
        this.download = download;
        parsingModel = Model.getApplicationSettings().getParsingModel();
        linkExtractor = new LinkExtractor(download.getUrl().toURI(), this);
        linkExtractor.setLinkState(download.getLinkState());
        hrefLinks = new ArrayList<String>();
        srcLinks = new ArrayList<String>();
    }
View Full Code Here

Examples of penny.parser.LinkExtractor

            }
        }
    }

    public void reset() throws URISyntaxException {
        linkExtractor = new LinkExtractor(download.getUrl().toURI(), this);
        download.setLinkState(linkExtractor.getLinkState());
        download.clearHrefLinks();
        download.clearSrcLinks();
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.