Package org.open.pagehealth

Source Code of org.open.pagehealth.LinkChecker

package org.open.pagehealth; /**
* User: swapnilsapar
* Date: 5/2/11
* Time: 4:20 PM
*/

import jline.ConsoleReader;
import org.apache.log4j.Logger;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.w3c.dom.Comment;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.awt.*;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Properties;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
* Example program to list links from a URL.
*/
public class LinkChecker {


    public static final String     PROPS_FILE       = "linkchecker.properties";
    public static final Properties PROPERTIES       = loadProperties();
    public static final String     DEFAULT_ROOT_URL =
            PROPERTIES.getProperty("org.open.pagehealth.default.page", "http://ebay.com");

    private static final Logger LOG                = Logger.getLogger(LinkChecker.class);
    private static final int    POOL_SIZE          = getIntProperty("org.open.pagehealth.poolsize", 10);
    public static final  int    CONNECTION_TIMEOUT = getIntProperty("org.open.pagehealth.timeout", 10000);
    public static final  int    RESULT_COL_WIDTH   = getIntProperty("org.open.pagehealth.result.width", 35);
    private static final String REPORT_FILE        =
            PROPERTIES.getProperty("org.open.pagehealth.reportfile", "report.html");

    public static final String PROXY = PROPERTIES.getProperty("org.open.pagehealth.proxy", null);

    public static final String USER_AGENT_STRING =
            PROPERTIES.getProperty("org.open.pagehealth.user-agent", "Mozilla/5.0");

    public static final boolean HEAD_REQUEST = getBooleanProperty("org.open.pagehealth.quick-check-head", false);

    private PageLink _rootPage;
    private long     _scanTime, _startTime;

    public static Properties loadProperties() {
        //System.setProperty("log4j.debug", "true");
        Properties props = null;
        final File propsFile = new File(PROPS_FILE);
        try {
            if (!propsFile.exists()) {
                propsFile.createNewFile();
                System.out.println("New File created. " + propsFile.getAbsolutePath());
            }
            props = new Properties();
            props.load(new FileInputStream(PROPS_FILE));
            System.out.println("Using properties file: " + propsFile.getAbsolutePath());
            System.setProperty("log4j.configuration", propsFile.toURI().toString());
        } catch (Exception exp) {
            exp.printStackTrace();
        }
        return props;
    }

    public static int getIntProperty(final String key, final int defaultValue) {
        final String strValue = PROPERTIES.getProperty(key, null);
        if (null == strValue) {
            return defaultValue;
        }
        try {
            return Integer.parseInt(strValue);
        } catch (Exception exp) {
            LOG.error("Invalid property value (un-parsable) value:" + strValue + ", key=" + key, exp);
            return defaultValue;
        }
    }

    public static boolean getBooleanProperty(final String key, final boolean defaultValue) {
        final String strValue = PROPERTIES.getProperty(key, null);
        if (null == strValue) {
            return defaultValue;
        }
        try {
            return Boolean.parseBoolean(strValue);
        } catch (Exception exp) {
            LOG.error("Invalid property value (un-parsable) value:" + strValue + ", key=" + key, exp);
            return defaultValue;
        }
    }


    public static void main(final String[] args) throws Exception {
        String rootURL = PROPERTIES.getProperty("org.open.pagehealth.page", null);
        if (rootURL == null) {
            rootURL = getPageUrlFromConsole();
        }
        new LinkChecker(validateURL(rootURL));
    }

    public LinkChecker(final String rootLink) {
        init(rootLink);
    }

    public void init(final String rootLink) {
        _rootPage = new PageLink(rootLink, "Root", "Root");
        _startTime = System.currentTimeMillis();
        final HashMap<String, PageLink> pageLinks = getLinksNodes(_rootPage);
        checkLinks(pageLinks);
        _scanTime = (System.currentTimeMillis() - _startTime);
        LOG.info("Scan duration: " + _scanTime + " ms with " + POOL_SIZE + " threads.");
        printResult(pageLinks);
    }

    public static String validateURL(String rootURL) {

        if (rootURL == null || rootURL.trim().length() < 1) {
            return null;
        }
        try {
            if (!rootURL.startsWith("http")) {
                rootURL = "http://" + rootURL;
            }
            final URL userURL = new URL(rootURL);
            userURL.getContent();
            return userURL.toString();
        } catch (MalformedURLException exp) {
            LOG.error("Invalid URL entered. Message: " + exp.getMessage());
        } catch (IOException exp) {
            LOG.error("Invalid URL entered. Message: ", exp);
        }
        return null;
    }

    public static String getPageUrlFromConsole() {
        try {
            final ConsoleReader reader = new ConsoleReader();
            System.out.print("Enter a website URL [" + DEFAULT_ROOT_URL + "]?: ");

            String line = reader.readLine();
            if (line.trim().length() == 0) {
                line = DEFAULT_ROOT_URL;
            }
            line = validateURL(line);
            if (null != line) {
                return line;
            }
            System.out.println("Please try again...");
            return getPageUrlFromConsole();

        } catch (IOException exp) {
            LOG.error("Invalid URL entered. Message: ", exp);
        }
        System.out.println("Please try again...");
        return getPageUrlFromConsole();
    }


    public HashMap<String, PageLink> getLinksNodes(final PageLink checkPage) {
        final long startTime = System.currentTimeMillis();
        try {
            LOG.info("Scanning page: " + checkPage.getURL());
            final Connection conn = Jsoup.connect(checkPage.getURL());
            final Document doc = conn.get();
            final Elements links = doc.select("a[href]");
            final Elements media = doc.select("[src]");
            final Elements imports = doc.select("link[href]");
            LOG.info("Page contains: Links: " + links.size() + ", Media: " + media.size() + ", " + "Imports: " +
                             imports.size());
            HashMap<String, PageLink> pageLinks =
                    new HashMap<String, PageLink>(links.size() + media.size() + imports.size());
            pageLinks = appendElements(pageLinks, links, "abs:href");
            pageLinks = appendElements(pageLinks, media, "abs:src");
            pageLinks = appendElements(pageLinks, imports, "abs:href");

            LOG.debug("Root page scan took " + (System.currentTimeMillis() - startTime) + " ms");
            return pageLinks;
        } catch (Exception exp) {
            exp.printStackTrace();
        }
        return null;
    }

    private HashMap<String, PageLink> appendElements(final HashMap<String, PageLink> pageLinkList,
                                                     final Elements elem,
                                                     final String attrKey) {
        for (final Element pageElement : elem) {
            final String linkTarget = pageElement.attr(attrKey);
            if (linkTarget == null || linkTarget.trim().length() < 1) {
                final String href = pageElement.attr("href");
                if (null != href && href.startsWith("javascript:")) {
                    LOG.debug("Skipping javascript link: " + pageElement);
                    continue;
                }
                LOG.error("Empty link found" + pageElement);
            }
            if (linkTarget.startsWith("mailto")) {
                LOG.info("ignoring mailto link: " + pageElement);
                continue;
            }
            final String caption = pageElement.hasText() ? pageElement.text() : pageElement.attr("alt");
            pageLinkList.put(linkTarget,
                             new PageLink(linkTarget, trim(caption, RESULT_COL_WIDTH),
                                          pageElement.tag().toString()));
            //print(" * a: <%s>  (%s)", li, trim(link.text(), 35));
        }
        return pageLinkList;
    }

    public void checkLinks(final HashMap<String, PageLink> pageLinks) {


        if (null == pageLinks || pageLinks.size() < 1) {
            LOG.error("No pageLinks found on the page");
            return;
        }
        /*
        for (org.open.pagehealth.PageLink node : pageLinks) {
            Thread th = new Thread(new org.open.pagehealth.ClickThread(node));
            th.start();
        }*/


        final BlockingQueue<Runnable> worksQueue = new ArrayBlockingQueue<Runnable>(pageLinks.size());
        final RejectedExecutionHandler executionHandler = new TaskOverflowHandler();
        LOG.info("Starting Pool: Threads: " + POOL_SIZE);
        // Create the ThreadPoolExecutor
        final ThreadPoolExecutor executor =
                new ThreadPoolExecutor(POOL_SIZE, POOL_SIZE, 3, TimeUnit.SECONDS, worksQueue, executionHandler);
        executor.allowCoreThreadTimeOut(true);

        final Object callback = new Object();

        // Starting the monitor thread as a daemon
        final Thread monitor = new Thread(new TasksMonitorThread(executor, callback), "TasksMonitorThread");
        monitor.setDaemon(true);
        monitor.start();
        final Iterator<PageLink> pageLinkIterator = pageLinks.values().iterator();
        int counter = 1;
        // Adding the tasks
        while (pageLinkIterator.hasNext()) {
            final PageLink pLink = pageLinkIterator.next();
            executor.execute(new ClickThread("" + counter++, pLink));
        }
        try {
            synchronized (callback) {
                LOG.debug("Going to sleep until all tasks are complete");
                callback.wait();
            }
        } catch (Exception exp) {
            LOG.error("Exception in wait", exp);
        }
        executor.shutdown();
        LOG.debug("Is pool stopped: " + executor.isShutdown());
        LOG.info("Pool shutdown.");
    }

    public void printResult(final HashMap<String, PageLink> pageLinks) {
        try {
            /////////////////////////////
            //Creating an empty XML Document

            //We need a Document
            final DocumentBuilderFactory dbfac = DocumentBuilderFactory.newInstance();
            final DocumentBuilder docBuilder = dbfac.newDocumentBuilder();
            final org.w3c.dom.Document doc = docBuilder.newDocument();

            ////////////////////////
            //Creating the XML tree

            //create the root element and add it to the document
            final org.w3c.dom.Element rootResultElement = doc.createElement("result");
            doc.appendChild(rootResultElement);

            //create a comment and put it in the root element
            final Comment comment = doc.createComment("LinkChecker application report file");
            rootResultElement.appendChild(comment);

            //create child element, add an attribute, and add to root
            final org.w3c.dom.Element pageElement = doc.createElement("page");
            pageElement.setAttribute("url", _rootPage.getURL());
            rootResultElement.appendChild(pageElement);

            // links node
            final org.w3c.dom.Element linksElement = doc.createElement("links");

            int goodLinks = 0;

            final Hashtable<String, Integer> threadInfo = new Hashtable<String, Integer>();

            for (final PageLink page : pageLinks.values()) {

                if (page.isGood()) { goodLinks++; }

                final String thName = page.getVerifiedThread();
                if (threadInfo.containsKey(thName)) {
                    Integer jobCount = threadInfo.get(thName);
                    jobCount = jobCount + 1;
                    threadInfo.put(thName, jobCount);
                } else {
                    threadInfo.put(thName, 1);
                }

                final org.w3c.dom.Element linkElement = doc.createElement("link");


                final org.w3c.dom.Element shortUrlElement2 = doc.createElement("short-url");
                shortUrlElement2.appendChild(doc.createTextNode(page.getShortURL()));
                linkElement.appendChild(shortUrlElement2);

                final org.w3c.dom.Element urlElement = doc.createElement("url");
                urlElement.appendChild(doc.createTextNode(page.getURL()));
                linkElement.appendChild(urlElement);

                final org.w3c.dom.Element typeElement = doc.createElement("type");
                typeElement.appendChild(doc.createTextNode(page.getType()));
                linkElement.appendChild(typeElement);

                final org.w3c.dom.Element captionElement = doc.createElement("caption");
                captionElement.appendChild(doc.createTextNode(page.getCaption()));
                linkElement.appendChild(captionElement);

                final org.w3c.dom.Element statusElement = doc.createElement("status");
                statusElement.appendChild(doc.createTextNode(page.getDisplayStatus()));
                linkElement.appendChild(statusElement);

                final org.w3c.dom.Element verifiedElement = doc.createElement("verified");
                verifiedElement.appendChild(doc.createTextNode(page.getDisplayVerifiedStatus()));
                linkElement.appendChild(verifiedElement);

                final org.w3c.dom.Element responseElement = doc.createElement("http-response");
                responseElement.appendChild(doc.createTextNode(page.getResponseStatus()));
                linkElement.appendChild(responseElement);

                final org.w3c.dom.Element contentTypeElement = doc.createElement("content-type");
                contentTypeElement.appendChild(doc.createTextNode(page.getContentType()));
                linkElement.appendChild(contentTypeElement);

                final org.w3c.dom.Element contentLengthElement = doc.createElement("content-length");
                contentLengthElement.appendChild(doc.createTextNode(page.getContentLength()));
                linkElement.appendChild(contentLengthElement);

                final org.w3c.dom.Element scanTimeElement = doc.createElement("scan-time");
                scanTimeElement.appendChild(doc.createTextNode(page.getScanTime()));
                linkElement.appendChild(scanTimeElement);

                final org.w3c.dom.Element threadElement = doc.createElement("verified-thread");
                threadElement.appendChild(doc.createTextNode(page.getVerifiedThread()));
                linkElement.appendChild(threadElement);
                linksElement.appendChild(linkElement);


            }
            pageElement.appendChild(linksElement);


            final org.w3c.dom.Element summaryElement = doc.createElement("summary");


            org.w3c.dom.Element child = doc.createElement("root-url");
            child.appendChild(doc.createTextNode(_rootPage.getURL()));
            summaryElement.appendChild(child);

            child = doc.createElement("link-count");
            child.appendChild(doc.createTextNode("" + pageLinks.size()));
            summaryElement.appendChild(child);

            child = doc.createElement("health");
            final String health = "" + ((float) goodLinks / pageLinks.size()) * 100;
            child.appendChild(doc.createTextNode(health));
            summaryElement.appendChild(child);

            child = doc.createElement("scan-time");
            child.appendChild(doc.createTextNode("" + _scanTime));
            summaryElement.appendChild(child);

            child = doc.createElement("start-time");
            child.appendChild(doc.createTextNode(new Date(_startTime).toString()));
            summaryElement.appendChild(child);

            child = doc.createElement("thread-count");
            child.appendChild(doc.createTextNode("" + POOL_SIZE));
            summaryElement.appendChild(child);

            pageElement.appendChild(summaryElement);


            //thread-pool node
            int max_weight = 0;

            final Enumeration<String> keys = threadInfo.keys();
            while (keys.hasMoreElements()) {

                final Integer threadCount = threadInfo.get(keys.nextElement());
                if (threadCount > max_weight) {
                    max_weight = threadCount;
                }
            }

            final org.w3c.dom.Element poolStatsElement = doc.createElement("thread-pool");
            final org.w3c.dom.Element threadsElement = doc.createElement("threads");


            final Enumeration<String> threadNames = threadInfo.keys();
            while (threadNames.hasMoreElements()) {
                final String threadName = threadNames.nextElement();
                final org.w3c.dom.Element threadElement = doc.createElement("thread");

                child = doc.createElement("name");
                child.appendChild(doc.createTextNode(threadName));
                threadElement.appendChild(child);

                child = doc.createElement("job-count");
                final Integer jobCount = threadInfo.get(threadName);
                child.appendChild(doc.createTextNode("" + jobCount));
                threadElement.appendChild(child);

                child = doc.createElement("job-share");
                final String share = "" + (float) jobCount / (max_weight) * 100;
                child.appendChild(doc.createTextNode("" + share));
                threadElement.appendChild(child);

                threadsElement.appendChild(threadElement);
            }
            poolStatsElement.appendChild(threadsElement);
            pageElement.appendChild(poolStatsElement);

            /////////////////
            //Output the XML

            //set up a transformer
            final Transformer transformer =
                    TransformerFactory.newInstance().newTransformer(new StreamSource("report-format" + ".xsl"));
            transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");

            //create string from xml tree
            //StringWriter sw = new StringWriter();
            final File sw = new File(REPORT_FILE);
            final DOMSource source = new DOMSource(doc);
            transformer.transform(source, new StreamResult(sw));
            LOG.info("Attempting to open the report file: " + sw.getAbsolutePath());
            Desktop.getDesktop().open(sw);

        } catch (Exception exp) {
            LOG.error("Error generating report", exp);
        }
    }

    private String trim(final String s, final int width) {
        if (s.length() > width) { return s.substring(0, width - 1) + "."; } else { return s; }
    }
}
TOP

Related Classes of org.open.pagehealth.LinkChecker

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.