/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package cosc561.searchengine.ejb;
import cosc561.searchengine.entities.Url;
import cosc561.searchengine.enums.UrlStatus;
import java.io.IOException;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.ejb.Stateless;
import javax.persistence.EntityManager;
import javax.persistence.PersistenceContext;
import javax.persistence.PersistenceException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Stateless session bean that scans stored {@link Url} entities: each call to
 * {@link #runUrlScanner()} downloads one pending URL, stores its HTML and
 * queues the links found on the page as new URLs.
 *
 * @author jraymond
 */
@Stateless
public class UrlService
{
    static final Logger logger = Logger.getLogger(UrlService.class.getName());

    @PersistenceContext(unitName = "SearchEnginePU")
    private EntityManager entityManager;

    /**
     * Scans a single URL whose status is {@link UrlStatus#NEW}.
     * <p>
     * The URL is marked {@link UrlStatus#SCANNING}, fetched with jsoup, and
     * every distinct, non-blank absolute link target on the page that is not
     * stored yet is persisted as a new {@link UrlStatus#NEW} URL. On success
     * the page HTML and scan date are recorded and the status becomes
     * {@link UrlStatus#SCANNED}; if the page cannot be fetched the status
     * becomes {@link UrlStatus#ERROR}. When no NEW URL exists the method
     * returns without doing anything.
     */
    public void runUrlScanner()
    {
        Url url = findNextNewUrl();
        if (url == null)
        {
            logger.log(Level.FINE, "No URL with status NEW is waiting to be scanned");
            return;
        }

        // Write the SCANNING marker to the database before the (slow) download.
        // NOTE(review): whether other transactions can see it before commit
        // depends on the isolation level - confirm if concurrent scanners run.
        url.setStatus(UrlStatus.SCANNING);
        entityManager.flush();

        Document document;
        try
        {
            document = Jsoup.connect(url.getId()).get();
        }
        catch (IOException | IllegalArgumentException ex)
        {
            // jsoup reports malformed or empty URLs with the unchecked
            // IllegalArgumentException. Letting it escape would roll the
            // transaction back and leave the entry NEW, so the same broken
            // URL could be picked again on every run.
            logger.log(Level.SEVERE, "Unable to fetch " + url, ex);
            url.setStatus(UrlStatus.ERROR);
            return;
        }

        Elements links = document.select("a[href]");
        logger.log(Level.INFO, "URL: " + url + ", links: " + links.size());
        persistNewUrls(collectLinkTargets(links));

        url.setDocument(document.html());
        url.setStatus(UrlStatus.SCANNED);
        url.setScannedOn(new Date());
    }

    /**
     * Returns one URL with status NEW, or {@code null} when none is queued.
     */
    private Url findNextNewUrl()
    {
        // getResultList() instead of getSingleResult(): the latter throws
        // NoResultException when the queue is empty.
        List<Url> pending = entityManager
                .createNamedQuery("Url.findByStatus", Url.class)
                .setParameter("status", UrlStatus.NEW)
                .setMaxResults(1)
                .getResultList();
        return pending.isEmpty() ? null : pending.get(0);
    }

    /**
     * Collects the distinct, non-blank absolute targets of the given links.
     */
    private Set<String> collectLinkTargets(Elements links)
    {
        // Deduplicate on the address itself so the result does not depend on
        // how Url implements equals/hashCode.
        Set<String> targets = new HashSet<>();
        for (Element link : links)
        {
            String target = link.attr("abs:href");
            if (logger.isLoggable(Level.FINE))
            {
                logger.log(Level.FINE, "\t" + target + " (" + link.text() + ")");
            }
            // "abs:href" yields an empty string when the href cannot be made
            // absolute; such links cannot be scanned later, so skip them.
            if (!target.trim().isEmpty())
            {
                targets.add(target);
            }
        }
        return targets;
    }

    /**
     * Persists a NEW {@link Url} for every target that is not stored yet.
     */
    private void persistNewUrls(Set<String> targets)
    {
        for (String target : targets)
        {
            Url currentUrl = new Url(target, UrlStatus.NEW);
            try
            {
                // Look the entity up first: a duplicate-key failure from
                // persist() may only surface at flush/commit time, where it
                // can no longer be handled per URL.
                // NOTE(review): assumes getId() returns the address passed to
                // the constructor - confirm against the Url entity.
                if (entityManager.find(Url.class, currentUrl.getId()) == null)
                {
                    entityManager.persist(currentUrl);
                }
            }
            catch (PersistenceException e)
            {
                logger.log(Level.SEVERE, "Exception: " + currentUrl + " could not be stored", e);
            }
        }
    }
}