// Package com.scraper.parser
//
// Source code of com.scraper.parser.HTMLParser

package com.scraper.parser;

import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.scraper.model.Image;

public class HTMLParser {
  public static final String IMG_FILETYPES = "(png|jpe?g|gif)";
  public static final String IMG_PATTERN = "img[src~=(?i)\\." + IMG_FILETYPES
      + "]";
  public static final String IMG_SUFFIX_PATTERN = ".*\\." + IMG_FILETYPES;
  public static final String IMG_URL_PREFIX = "https?://.*";
  private List<String> urls = new ArrayList<>();
  private Iterator<Element> iterator;
  private String url;
  private String error;
  private boolean hasError;

  private HTMLParser(Iterator<Element> iterator, String url) {
    this.url = url;
    this.iterator = iterator;
  }

  private HTMLParser(String error) {
    this.error = error;
    hasError = true;
  }

  public static HTMLParser parseImages(String url)
  {
    HTMLParser parser = null;
    Document doc = null;
    try
    {
      url = url.matches(IMG_URL_PREFIX) ? url : "http://" + url;
      url = url.endsWith("/") ? url.substring(0, url.length()) : url;
      doc = Jsoup.connect(url).get();
      Elements images = doc.select(IMG_PATTERN);
      parser = new HTMLParser(images.iterator(), url);
    }
    catch (IllegalArgumentException ae)
    {
      parser = new HTMLParser("Invalid URL.");
      ae.printStackTrace();
    }
    catch (UnknownHostException uhe)
    {
      parser = new HTMLParser("Unable to resolve " + url);
      uhe.printStackTrace();
    }
    catch (SocketTimeoutException ste)
    {
      parser = new HTMLParser("Connection timed out.");
      ste.printStackTrace();
    }
    catch (IOException e)
    {
      e.printStackTrace();
    }

    return parser;
  }

  public Image nextImage()
  {
    Image image = null;
    if(iterator.hasNext())
    {
      Element el = iterator.next();
      String url = el.attr("src");

      if(url != null && url.trim().length() > 0
          && url.matches(IMG_SUFFIX_PATTERN) && !urls.contains(url))
      {
        image = new Image();
        int suffixIndex = url.lastIndexOf('.') + 1;
        int nameIndex = url.lastIndexOf('/') + 1;
        image.setName(url.substring(nameIndex));
        image.setExtension(url.substring(suffixIndex));
        url = url.matches(IMG_URL_PREFIX) ? url : (this.url + url);
        image.setUrl(url);
        urls.add(url);
      }
    }

    return image;
  }

  public boolean hasNextImage()
  {
    return iterator != null && iterator.hasNext();
  }

  public boolean hasError()
  {
    return hasError;
  }

  public String getError()
  {
    return error;
  }
}
// TOP
//
// Related Classes of com.scraper.parser.HTMLParser
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.