/**
 * Entity manager bound to the "SearchEnginePU" persistence unit; runUrlScanner() uses it to
 * query, flush and persist Url entities.
 */
@PersistenceContext(unitName = "SearchEnginePU")
private EntityManager entityManager;
/**
 * Scans one {@code Url} returned by the {@code Url.findByStatus} query for status
 * {@code NEW}: downloads the page with jsoup, queues every outgoing link that is not yet
 * known as a new {@code Url}, and stores the page HTML on the scanned entry.
 *
 * <p>Status transitions of the scanned entry: {@code SCANNING} (flushed before the
 * download starts), then {@code SCANNED} on success or {@code ERROR} when the page cannot
 * be fetched. Returns without doing anything when no {@code NEW} url is available.
 */
public void runUrlScanner()
{
    // getResultList() rather than getSingleResult(): an empty queue is a normal state and
    // must not surface as a NoResultException. java.util.List is fully qualified so this
    // block does not rely on an import it cannot see.
    java.util.List<Url> pending = entityManager.createNamedQuery("Url.findByStatus", Url.class)
            .setParameter("status", UrlStatus.NEW)
            .setMaxResults(1)
            .getResultList();
    if (pending.isEmpty())
    {
        logger.log(Level.FINE, "No url with status NEW to scan");
        return;
    }

    Url url = pending.get(0);
    url.setStatus(UrlStatus.SCANNING);
    entityManager.flush();

    // Test url if valid
    Document document;
    try
    {
        document = Jsoup.connect(url.getId()).get();
    }
    catch (IOException | IllegalArgumentException ex)
    {
        // Jsoup.connect() rejects malformed or empty addresses with an unchecked
        // IllegalArgumentException; handle it like a failed download so the entry ends up
        // in ERROR instead of the exception escaping this method.
        logger.log(Level.SEVERE, "Could not fetch " + url.getId(), ex);
        url.setStatus(UrlStatus.ERROR);
        return;
    }

    // Parse and add new urls
    Elements links = document.select("a[href]");
    logger.log(Level.FINE, "URL: {0} Links: {1}",
            new Object[] {url, String.valueOf(links.size())});

    // De-duplicate on the address itself so this does not depend on Url.equals/hashCode.
    Set<String> targets = new HashSet<>();
    for (Element link : links)
    {
        String target = link.attr("abs:href");
        logger.log(Level.FINER, "\t{0} ({1})", new Object[] {target, link.text()});
        // jsoup yields "" when the href cannot be resolved to an absolute URL.
        if (!target.isEmpty())
        {
            targets.add(target);
        }
    }

    for (String target : targets)
    {
        try
        {
            Url candidate = new Url(target, UrlStatus.NEW);
            // Skip addresses that are already stored (including the page linking to
            // itself) instead of provoking a duplicate-key failure on persist.
            // NOTE(review): assumes getId() is the entity's primary key, as suggested by
            // its use as the address passed to Jsoup above - confirm against the Url mapping.
            if (entityManager.find(Url.class, candidate.getId()) == null)
            {
                entityManager.persist(candidate);
            }
        }
        catch (RuntimeException e)
        {
            // Best effort: one link that cannot be stored must not abort the scan.
            // NOTE(review): a persistence exception may still mark the surrounding
            // transaction for rollback - verify how this method is invoked.
            logger.log(Level.SEVERE, "Could not persist url " + target, e);
        }
    }

    url.setDocument(document.html());
    url.setStatus(UrlStatus.SCANNED);
    url.setScannedOn(new Date());
}