package net.sf.jpluck.handlers;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXResult;
import net.sf.jpluck.ClientConfiguration;
import net.sf.jpluck.feed.AtomFeed;
import net.sf.jpluck.feed.FeedFormatter;
import net.sf.jpluck.feed.FeedItem;
import net.sf.jpluck.feed.RSSFeed;
import net.sf.jpluck.jxl.Feed;
import net.sf.jpluck.jxl.URIRewriter;
import net.sf.jpluck.plucker.Document;
import net.sf.jpluck.plucker.TextRecord;
import net.sf.jpluck.plucker.parsing.XHTMLEntityResolver;
import net.sf.jpluck.plucker.parsing.html.HTMLSerializer;
import net.sf.jpluck.spider.Resource;
import net.sf.jpluck.xml.TextRecordResult;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Content handler that turns a fetched syndication feed into Plucker records.
 *
 * <p>The resource data is parsed as XML. A document whose root element has the
 * local name <code>feed</code> is wrapped in an {@link AtomFeed}; every other
 * document is wrapped in an {@link RSSFeed}. The feed is then rendered through a
 * {@link FeedFormatter} into an index record and, when the JXL feed settings
 * request a page structure, one additional record per feed item.</p>
 */
public class FeedHandler extends ContentHandler {
    /** Text inserted between the feed URI and the 1-based item number to form an item page URI. */
    private static final String ITEM_URI_INFIX = "_item-";

    /** Bookmark title used for items that have no title. */
    private static final String MISSING_TITLE = "<no title>";

    /** Character encoding used for the second parse attempt when the first one fails. */
    private static final String FALLBACK_ENCODING = "ISO-8859-1";

    /**
     * Creates a handler for a single feed resource.
     *
     * @param pluckerDocument the Plucker document that receives the generated records
     * @param jxlDocument     the JXL document definition; {@link #handle()} casts it to
     *                        {@link net.sf.jpluck.jxl.Feed}
     * @param resource        the fetched resource holding the raw feed data
     */
    public FeedHandler(Document pluckerDocument, net.sf.jpluck.jxl.Document jxlDocument, Resource resource) {
        super(pluckerDocument, jxlDocument, resource);
    }

    /**
     * Parses the feed and adds its records, links, bookmarks, dates and categories
     * to the Plucker document.
     *
     * @throws HandlingException wrapping any exception raised while parsing or
     *                           converting the feed
     */
    public void handle() throws HandlingException {
        try {
            net.sf.jpluck.jxl.Feed settings = (Feed) jxlDocument;
            org.w3c.dom.Document dom = parseDocument(resource.getData());
            net.sf.jpluck.feed.Feed feed = newFeed(dom);
            FeedFormatter formatter = new FeedFormatter(feed, settings);
            boolean generateBookmarks = ClientConfiguration.getDefault().isGenerateBookmarks();

            // Looking up the factory is comparatively expensive, so it is done once per
            // feed instead of once per item; a fresh Transformer is still created for
            // every document that gets serialized.
            TransformerFactory transformerFactory = TransformerFactory.newInstance();

            FeedItem[] items = feed.getItems();
            if (settings.isUsePageStructure()) {
                addItemPages(items, formatter, settings, transformerFactory, generateBookmarks);
            } else {
                boolean bookmarkItems = generateBookmarks && settings.isUseLinkStructure();
                bookmarkLinkedItems(items, bookmarkItems);
            }

            addIndexPage(formatter, settings, transformerFactory);
            applyFeedDate(items);
            pluckerDocument.setCategories(jxlDocument.getCategories());
        } catch (Exception e) {
            throw new HandlingException(e);
        }
    }

    /**
     * Wraps the parsed DOM in the feed implementation matching its root element.
     *
     * @param dom the parsed feed document
     * @return an {@link AtomFeed} when the root element's local name is
     *         <code>feed</code>, otherwise an {@link RSSFeed}
     * @throws Exception if the feed object cannot be created
     */
    private net.sf.jpluck.feed.Feed newFeed(org.w3c.dom.Document dom) throws Exception {
        String rootName = dom.getDocumentElement().getLocalName();
        // Constant-first comparison: a null local name falls through to RSS
        // instead of raising a NullPointerException.
        if ("feed".equals(rootName)) {
            return new AtomFeed(dom);
        }
        return new RSSFeed(dom);
    }

    /**
     * Adds one text record per feed item and registers the links, images and
     * (optionally) bookmarks found on each item page.
     *
     * @param items              the feed items, in feed order
     * @param formatter          formatter supplying the per-item documents
     * @param settings           the JXL feed settings
     * @param transformerFactory factory used to create one transformer per item
     * @param generateBookmarks  whether a bookmark is added for every item page
     * @throws Exception if an item document cannot be serialized
     */
    private void addItemPages(FeedItem[] items, FeedFormatter formatter, net.sf.jpluck.jxl.Feed settings,
            TransformerFactory transformerFactory, boolean generateBookmarks) throws Exception {
        // All item pages share this prefix; it is also used below to recognise
        // links that point at another item page of this same feed.
        String itemUriPrefix = resource.getURI() + ITEM_URI_INFIX;

        for (int i = 0; i < items.length; i++) {
            String uri = itemUriPrefix + (i + 1);
            TextRecord textRecord = new TextRecord(uri,
                    jxlDocument.getOutputEncoding(),
                    jxlDocument.isUseHiresMargins());
            HTMLSerializer serializer = new HTMLSerializer(textRecord, jxlDocument.getURIRewriter(),
                    settings.getTextColorBrightness(), settings.isParseTables(),
                    pluckerDocument);

            // NOTE(review): getItemDocuments() is invoked once per item, as before.
            // If it rebuilds all documents on every call it should be hoisted out of
            // the loop - confirm against FeedFormatter.
            Transformer transformer = transformerFactory.newTransformer();
            transformer.transform(new DOMSource(formatter.getItemDocuments()[i]), new SAXResult(serializer));
            pluckerDocument.addRecord(serializer.getDataRecord());

            // Links to other item pages of this feed are not queued as page links.
            String[] pageURIs = serializer.getLinkURIs();
            for (int j = 0; j < pageURIs.length; j++) {
                if (!pageURIs[j].startsWith(itemUriPrefix)) {
                    addPageLink(pageURIs[j]);
                }
            }

            String[] imageURIs = serializer.getEmbeddedImageURIs();
            for (int j = 0; j < imageURIs.length; j++) {
                addImageLink(imageURIs[j]);
            }

            if (generateBookmarks) {
                addBookmark(items[i], uri);
            }
        }
    }

    /**
     * Rewrites the URL of every feed item and, when requested, bookmarks the
     * rewritten URI.
     *
     * @param items         the feed items, in feed order
     * @param bookmarkItems whether a bookmark is added for each rewritten URI
     * @throws Exception if a URL cannot be rewritten
     */
    private void bookmarkLinkedItems(FeedItem[] items, boolean bookmarkItems) throws Exception {
        // NOTE(review): the rewrite is executed for every item even when no bookmark
        // is added, exactly as before. If URIRewriter.rewrite has no side effects the
        // loop can be skipped entirely when bookmarkItems is false - confirm.
        for (int i = 0; i < items.length; i++) {
            URIRewriter rewriter = jxlDocument.getURIRewriter();
            String uri = rewriter.rewrite(items[i].getURL());
            if (bookmarkItems) {
                addBookmark(items[i], uri);
            }
        }
    }

    /**
     * Serializes the formatter's index document into the resource's own text
     * record and registers the links and images it contains.
     *
     * @param formatter          formatter supplying the index document
     * @param settings           the JXL feed settings
     * @param transformerFactory factory used to create the transformer
     * @throws Exception if the index document cannot be serialized
     */
    private void addIndexPage(FeedFormatter formatter, net.sf.jpluck.jxl.Feed settings,
            TransformerFactory transformerFactory) throws Exception {
        Transformer transformer = transformerFactory.newTransformer();
        TextRecord textRecord = resource.createTextRecord(jxlDocument.getOutputEncoding(),
                jxlDocument.isUseHiresMargins());
        TextRecordResult result = new TextRecordResult(textRecord, jxlDocument.getURIRewriter(),
                settings.getTextColorBrightness(), settings.isParseTables(),
                pluckerDocument);
        transformer.transform(new DOMSource(formatter.getIndexDocument()), result);
        pluckerDocument.addRecord(textRecord);

        // Page links from the index are only queued for the link and list structures.
        if (settings.isUseLinkStructure() || settings.isUseListStructure()) {
            String[] pageURIs = result.getLinkURIs();
            for (int i = 0; i < pageURIs.length; i++) {
                addPageLink(pageURIs[i]);
            }
        }

        String[] imageURIs = result.getEmbeddedImageURIs();
        for (int i = 0; i < imageURIs.length; i++) {
            addImageLink(imageURIs[i]);
        }
    }

    /**
     * Stamps the Plucker document with the issue date of the first feed item when
     * the client configuration asks for feed dates and that item has a date.
     *
     * @param items the feed items, in feed order
     */
    private void applyFeedDate(FeedItem[] items) {
        if (ClientConfiguration.getDefault().isUseFeedDate() && (items.length > 0)) {
            Date date = items[0].getDateIssued();
            if (date != null) {
                pluckerDocument.setCreationDate(date);
                pluckerDocument.setModificationDate(date);
                pluckerDocument.setLastBackupDate(date);
            }
        }
    }

    /**
     * Adds a bookmark for a feed item; does nothing when <code>uri</code> is null.
     *
     * @param item the feed item supplying the bookmark title
     * @param uri  the bookmark target, may be null
     */
    private void addBookmark(FeedItem item, String uri) {
        if (uri == null) {
            return;
        }
        String title = item.getTitle();
        if (title == null) {
            title = MISSING_TITLE;
        }
        pluckerDocument.addBookmark(title, uri);
    }

    /**
     * Parses raw feed bytes into a namespace-aware, non-validated DOM document.
     *
     * <p>The first attempt lets the parser determine the encoding from the byte
     * stream. If that attempt fails for any reason, the bytes are decoded as
     * ISO-8859-1 and parsed again; a failure of the second attempt is propagated.</p>
     *
     * @param data the raw feed bytes
     * @return the parsed document
     * @throws Exception if the parser cannot be created or both parse attempts fail
     */
    private org.w3c.dom.Document parseDocument(byte[] data) throws Exception {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        dbf.setValidating(false);
        DocumentBuilder builder = dbf.newDocumentBuilder();
        // Every external entity, whatever its public or system id, is answered with
        // the input source from XHTMLEntityResolver.getTransitionalDTD().
        builder.setEntityResolver(new EntityResolver() {
            public InputSource resolveEntity(String publicId, String systemId)
                    throws SAXException, IOException {
                return XHTMLEntityResolver.getTransitionalDTD();
            }
        });

        org.w3c.dom.Document document = null;
        try {
            document = builder.parse(new InputSource(new ByteArrayInputStream(data)));
        } catch (Exception e) {
            // Deliberate best effort: the first failure is discarded and the data is
            // re-read through a reader with a fixed encoding.
            InputStreamReader reader = new InputStreamReader(new ByteArrayInputStream(data), FALLBACK_ENCODING);
            document = builder.parse(new InputSource(reader));
        }
        return document;
    }
}