Package net.sf.jpluck.spider

Source Code of net.sf.jpluck.spider.Parser

package net.sf.jpluck.spider;

import net.sf.jpluck.handlers.ContentHandler;
import net.sf.jpluck.handlers.HandlingException;
import java.util.logging.Level;
import java.util.logging.Logger;

abstract class Parser implements Runnable {
    protected final Logger logger = Logger.getLogger("parser");

    protected Spider spider;
    protected Resource resource;

    protected Parser(Spider spider, Resource resource) {
        this.spider = spider;
        this.resource=resource;
    }

    public void run() {
        try {
            if (!spider.isRunning()) {
                return;
            }
            spider.fireParsingStarted(resource.getURI());
            ContentHandler contentHandler = getContentHandler();
            if (contentHandler == null) {
                logger.warning(resource.getMimeType() + " content not handled.");
                return;
            }
            contentHandler.handle();
            String[] pageLinks = contentHandler.getPageLinks();
            for (int i = 0; i < pageLinks.length; i++) {
                String link = pageLinks[i];
                spider.retrieve(link, LinkFilter.EXTERNAL_LINK, resource.getLevel() + 1, resource.getURI());
            }
            String[] imageLinks = contentHandler.getImageLinks();
            for (int i = 0; i < imageLinks.length; i++) {
                String image = imageLinks[i];
                spider.retrieve(image, LinkFilter.EMBEDDED_IMAGE_LINK, resource.getLevel(), resource.getURI());
            }
        } catch (HandlingException e) {
          String s ="Error handling content " + resource.getURI() + ". " ;
          if (e.getCause() != null ) {
            s += e.getCause() + ": " + e.getCause().getMessage();
          }
            logger.warning(s);
      e.printStackTrace();
            throw new RuntimeException(e);
        } finally {
            spider.fireParsingCompleted(resource.getURI());
            spider.notifyTaskDone();
        }
    }

    protected abstract ContentHandler getContentHandler();
}
TOP

Related Classes of net.sf.jpluck.spider.Parser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.