package com.scraper.parser;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.scraper.model.Image;
/**
 * Fetches an HTML page with jsoup and hands out the {@code <img>} elements
 * whose {@code src} ends in one of the supported image extensions, one
 * {@link Image} at a time.
 *
 * <p>Instances are obtained through {@link #parseImages(String)}. A parser is
 * either in a usable state (it wraps an iterator over the matched elements) or
 * in an error state (it carries a message and has no elements).
 */
public class HTMLParser {
    /** Regex alternation of the supported image file extensions. */
    public static final String IMG_FILETYPES = "(png|jpe?g|gif)";
    /** jsoup selector for {@code img} elements whose {@code src} contains a supported extension (case-insensitive). */
    public static final String IMG_PATTERN = "img[src~=(?i)\\." + IMG_FILETYPES
            + "]";
    /** Regex matching a string that ends in a supported extension. */
    public static final String IMG_SUFFIX_PATTERN = ".*\\." + IMG_FILETYPES;
    /** Regex matching a string that starts with an http or https scheme. */
    public static final String IMG_URL_PREFIX = "https?://.*";

    /** Resolved URLs already handed out, used to suppress duplicates. */
    private final List<String> urls = new ArrayList<>();
    /** Iterator over the matched elements; {@code null} in the error state. */
    private final Iterator<Element> iterator;
    /** Page URL without a trailing slash, used as prefix for relative sources; {@code null} in the error state. */
    private final String url;
    private final String error;
    private final boolean hasError;

    private HTMLParser(Iterator<Element> iterator, String url) {
        this.iterator = iterator;
        this.url = url;
        this.error = null;
        this.hasError = false;
    }

    private HTMLParser(String error) {
        this.iterator = null;
        this.url = null;
        this.error = error;
        this.hasError = true;
    }

    /**
     * Downloads the page at {@code url} and prepares a parser over its images.
     *
     * <p>If {@code url} does not start with {@code http://} or
     * {@code https://}, {@code http://} is prepended.
     *
     * @param url address of the page to scan
     * @return a parser; never {@code null}. On failure the returned parser
     *         reports {@link #hasError()} and carries a message in
     *         {@link #getError()}.
     */
    public static HTMLParser parseImages(String url) {
        if (url == null) {
            return new HTMLParser("Invalid URL.");
        }
        try {
            url = url.matches(IMG_URL_PREFIX) ? url : "http://" + url;
            // The page is fetched with the URL as given; only the prefix used
            // for relative image sources has its trailing slash removed.
            String base = url.endsWith("/")
                    ? url.substring(0, url.length() - 1)
                    : url;
            Document doc = Jsoup.connect(url).get();
            Elements images = doc.select(IMG_PATTERN);
            return new HTMLParser(images.iterator(), base);
        } catch (IllegalArgumentException ae) {
            ae.printStackTrace();
            return new HTMLParser("Invalid URL.");
        } catch (UnknownHostException uhe) {
            uhe.printStackTrace();
            return new HTMLParser("Unable to resolve " + url);
        } catch (SocketTimeoutException ste) {
            ste.printStackTrace();
            return new HTMLParser("Connection timed out.");
        } catch (IOException e) {
            // Previously this path returned null; report it as an error-state
            // parser like every other failure.
            e.printStackTrace();
            return new HTMLParser("Unable to load " + url);
        }
    }

    /**
     * Consumes the next matched element and converts it to an {@link Image}.
     *
     * @return the image, or {@code null} when there is no next element, when
     *         the element's {@code src} does not end in a supported extension,
     *         or when the same resolved URL was already returned. A
     *         {@code null} result therefore does not imply exhaustion; use
     *         {@link #hasNextImage()} to drive the loop.
     */
    public Image nextImage() {
        if (!hasNextImage()) {
            return null;
        }
        Element el = iterator.next();
        String src = el.attr("src");
        // The selector is case-insensitive, so the suffix check must be too.
        if (src == null || src.trim().isEmpty()
                || !src.matches("(?i)" + IMG_SUFFIX_PATTERN)) {
            return null;
        }

        String resolved;
        if (src.matches(IMG_URL_PREFIX)) {
            resolved = src;
        } else {
            // Join page URL and relative source with exactly one slash.
            resolved = this.url + (src.startsWith("/") ? "" : "/") + src;
        }
        // Compare resolved URLs so repeated relative sources are caught too.
        if (urls.contains(resolved)) {
            return null;
        }

        Image image = new Image();
        image.setName(src.substring(src.lastIndexOf('/') + 1));
        image.setExtension(src.substring(src.lastIndexOf('.') + 1));
        image.setUrl(resolved);
        urls.add(resolved);
        return image;
    }

    /**
     * @return {@code true} if another matched element is available;
     *         always {@code false} for a parser in the error state
     */
    public boolean hasNextImage() {
        return iterator != null && iterator.hasNext();
    }

    /** @return {@code true} if the page could not be loaded */
    public boolean hasError() {
        return hasError;
    }

    /** @return the failure message, or {@code null} when there was no error */
    public String getError() {
        return error;
    }
}