Package net.sf.jpluck.handlers

Source Code of net.sf.jpluck.handlers.FeedHandler

package net.sf.jpluck.handlers;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXResult;

import net.sf.jpluck.ClientConfiguration;
import net.sf.jpluck.feed.AtomFeed;
import net.sf.jpluck.feed.FeedFormatter;
import net.sf.jpluck.feed.FeedItem;
import net.sf.jpluck.feed.RSSFeed;
import net.sf.jpluck.jxl.Feed;
import net.sf.jpluck.jxl.URIRewriter;
import net.sf.jpluck.plucker.Document;
import net.sf.jpluck.plucker.TextRecord;
import net.sf.jpluck.plucker.parsing.XHTMLEntityResolver;
import net.sf.jpluck.plucker.parsing.html.HTMLSerializer;
import net.sf.jpluck.spider.Resource;
import net.sf.jpluck.xml.TextRecordResult;

import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;


public class FeedHandler extends ContentHandler {
  public FeedHandler(Document pluckerDocument, net.sf.jpluck.jxl.Document jxlDocument, Resource resource) {
    super(pluckerDocument, jxlDocument, resource);
  }

  public void handle() throws HandlingException {
    try {
      net.sf.jpluck.jxl.Feed settings = (Feed) jxlDocument;
     
      org.w3c.dom.Document dom = parseDocument(resource.getData());
      String rootName = dom.getDocumentElement().getLocalName();
      net.sf.jpluck.feed.Feed feed = null;
      if (rootName.equals("feed")) {
        // Atom feed
        feed = new AtomFeed(dom);
      } else {   
        feed = new RSSFeed(dom);
      }
      FeedFormatter formatter = new FeedFormatter(feed, settings);
      boolean generateBookmarks = ClientConfiguration.getDefault().isGenerateBookmarks();
      if (settings.isUsePageStructure()) {
        FeedItem[] items = feed.getItems();
        for (int i = 0, n = items.length; i < n; i++) {
          String uri = resource.getURI() + "_item-" + (i + 1);
          TextRecord textRecord = new TextRecord(uri,
                               jxlDocument.getOutputEncoding(),
                               jxlDocument.isUseHiresMargins());
          HTMLSerializer serializer = new HTMLSerializer(textRecord, jxlDocument.getURIRewriter(),
                                   settings.getTextColorBrightness(), settings.isParseTables(),
                                   pluckerDocument);
          Transformer transformer = TransformerFactory.newInstance().newTransformer();
          transformer.transform(new DOMSource(formatter.getItemDocuments()[i]), new SAXResult(serializer));
          pluckerDocument.addRecord(serializer.getDataRecord());
         
          String[] pageURIs = serializer.getLinkURIs();
          for (int j = 0; j < pageURIs.length; j++) {
            if (!pageURIs[j].startsWith(resource.getURI() + "_item-")) {
              addPageLink(pageURIs[j]);
            }
          }

          String[] imageURIs = serializer.getEmbeddedImageURIs();
          for (int j = 0; j < imageURIs.length; j++) {
            addImageLink(imageURIs[j]);
          }

          if (generateBookmarks) {
            addBookmark(items[i], formatter, uri);         
          }
        }
      } else {
        FeedItem[] items = feed.getItems();
        for (int i = 0; i < items.length; i++) {
          FeedItem item = items[i];
          URIRewriter rewriter = jxlDocument.getURIRewriter();
          String uri = rewriter.rewrite(item.getURL());
          if (generateBookmarks && settings.isUseLinkStructure()) {
            addBookmark(items[i], formatter, uri);
          }
        }
      }

      Transformer transformer = TransformerFactory.newInstance().newTransformer();
      TextRecord textRecord = resource.createTextRecord(jxlDocument.getOutputEncoding(),
                                jxlDocument.isUseHiresMargins());
      TextRecordResult result = new TextRecordResult(textRecord, jxlDocument.getURIRewriter(),
                               settings.getTextColorBrightness(), settings.isParseTables(),
                               pluckerDocument);
      transformer.transform(new DOMSource(formatter.getIndexDocument()), result);
      pluckerDocument.addRecord(textRecord);
      if (settings.isUseLinkStructure() || settings.isUseListStructure()) {
        String[] pageURIs = result.getLinkURIs();
        for (int i = 0; i < pageURIs.length; i++) {
          addPageLink(pageURIs[i]);
        }
      }

      String[] imageURIs = result.getEmbeddedImageURIs();
      for (int i = 0; i < imageURIs.length; i++) {
        addImageLink(imageURIs[i]);
      }
      if (ClientConfiguration.getDefault().isUseFeedDate() && (feed.getItems().length > 0)) {
        Date date = feed.getItems()[0].getDateIssued();
        if (date != null) {
          pluckerDocument.setCreationDate(date);
          pluckerDocument.setModificationDate(date);
          pluckerDocument.setLastBackupDate(date);
        }
      }
      pluckerDocument.setCategories(jxlDocument.getCategories());
    } catch (Exception e) {
      throw new HandlingException(e);
    }
  }
 
  private void addBookmark(FeedItem item, FeedFormatter formatter, String uri) {
    if (uri != null) {
      String title = item.getTitle();
      if (title == null) {
        title = "<no title>";
      }
      pluckerDocument.addBookmark(title, uri);         
    }
  }
 
  private org.w3c.dom.Document parseDocument(byte[] data) throws Exception {
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    dbf.setNamespaceAware(true);
    dbf.setValidating(false);

    DocumentBuilder builder = dbf.newDocumentBuilder();
    builder.setEntityResolver(new EntityResolver() {
        public InputSource resolveEntity(String publicId, String systemId)
          throws SAXException, IOException {
          return XHTMLEntityResolver.getTransitionalDTD();
        }
      });

    org.w3c.dom.Document document = null;
    try {
      document = builder.parse(new InputSource(new ByteArrayInputStream(data)));
    } catch (Exception e) {
      InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(data), "ISO-8859-1");
      document = builder.parse(new InputSource(reader));
    }
    return document;
  }
}
TOP

Related Classes of net.sf.jpluck.handlers.FeedHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.