Package cosc561.searchengine.ejb

Source Code of cosc561.searchengine.ejb.UrlService

/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/

package cosc561.searchengine.ejb;

import cosc561.searchengine.entities.Url;
import cosc561.searchengine.enums.UrlStatus;
import java.io.IOException;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.ejb.Stateless;
import javax.persistence.EntityManager;
import javax.persistence.PersistenceContext;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
*
* @author jraymond
*/
@Stateless
public class UrlService
{
    static final Logger logger = Logger.getLogger(UrlService.class.getName());
    @PersistenceContext(unitName = "SearchEnginePU")
    private EntityManager entityManager;

    public void runUrlScanner()
    {
        Url url = entityManager.createNamedQuery("Url.findByStatus", Url.class).setParameter("status", UrlStatus.NEW).setMaxResults(1).getSingleResult();
        Document document = null;

        url.setStatus(UrlStatus.SCANNING);
        entityManager.flush();

        // Test url if valid
        try
        {
            document = Jsoup.connect(url.getId()).get();
        }
        catch (IOException ex)
        {
            logger.log(Level.SEVERE, null, ex);
            url.setStatus(UrlStatus.ERROR);
            return;
        }

        // Parse and add new urls
        Elements links = document.select("a[href]");
        Set<Url> urls = new HashSet<>();

        System.out.println("URL: " + url);
        System.out.println("Links: " + links.size());
        for (Element link : links)
        {
            System.out.println("\t" + link.attr("abs:href") + " (" + link.text() + ")");
            urls.add(new Url(link.attr("abs:href"), UrlStatus.NEW));
        }

        for (Url currentUrl : urls)
        {
            try
            {
                entityManager.persist(currentUrl);
            }
            catch(Exception e)
            {
                logger.log(Level.SEVERE, "Exception: " + url + " already exists", e);
            }
        }

        url.setDocument(document.html());
        url.setStatus(UrlStatus.SCANNED);
        url.setScannedOn(new Date());
    }
}
TOP

Related Classes of cosc561.searchengine.ejb.UrlService

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.