// Copyright 2007 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.enterprise.connector.gdata;
import java.util.logging.*;
import java.util.*;
import java.net.*;
import java.io.*;
import com.google.enterprise.connector.spi.Connector;
import com.google.enterprise.connector.spi.Property;
import com.google.enterprise.connector.spi.TraversalManager;
import com.google.enterprise.connector.spi.AuthenticationManager;
import com.google.enterprise.connector.spi.AuthorizationManager;
import com.google.enterprise.connector.spi.Session;
import com.google.enterprise.connector.spi.SpiConstants;
import com.google.enterprise.connector.spi.DocumentList;
import com.google.enterprise.connector.spi.Document;
import com.google.enterprise.connector.spi.SimpleDocument;
import com.google.enterprise.connector.spi.SimpleProperty;
import com.google.enterprise.connector.spi.Value;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.gdata.client.Service;
import com.google.gdata.client.Query;
import com.google.gdata.util.ServiceException;
import com.google.gdata.util.NotModifiedException;
import com.google.gdata.data.Feed;
import com.google.gdata.data.Entry;
import com.google.gdata.data.DateTime;
import com.google.gdata.data.Category;
import com.google.gdata.data.Person;
import com.google.gdata.data.Content;
import com.google.gdata.data.TextContent;
import com.google.gdata.data.MediaContent;
import com.google.gdata.data.OtherContent;
import com.google.gdata.data.media.MediaSource;
/**
* This class fills the roles of Connector, Session and TraversalManager for
* a connector that fetches entries from a GData feed and packages their
* content and metadata for consumption by the connector manager. Entries are
* processed in order of increasing last-modified time.
*
* A single property, feedUrl, controls the functioning of this connector.
*
* Only public data is considered by this connector, so no authentication or
* authorization is needed (or implemented).
*
* @author amsmith@google.com (Adam Smith)
*/
public class GdConnector implements Connector, Session, TraversalManager {

  /**
   * Key names for the connector-specific meta data that is packaged along
   * with the required keys defined in the SPI.
   */
  private static final String PROPNAME_TITLE = "title";
  private static final String PROPNAME_SUMMARY = "summary";
  private static final String PROPNAME_CATEGORY = "category";
  private static final String PROPNAME_AUTHOR = "author";

  /**
   * The maximum number of entries to be fetched when requesting new items
   * from the feed. If more than this number of results are available,
   * last-updated-time traversal order is NOT guaranteed and some updates
   * can be missed.
   */
  private static final int MAX_RESULTS = Integer.MAX_VALUE;

  /** The logger for this class. */
  private static final Logger LOGGER =
      Logger.getLogger(GdConnector.class.getName());

  /** The URL for the feed to be consumed. */
  private URL feedUrl;

  /** The GData service that will run queries on the feeds. */
  private Service service;

  /**
   * Sets the URL of the feed to consume.
   *
   * @param feedUrl the feed URL used to build every query
   */
  public void setFeedUrl(URL feedUrl) {
    if (LOGGER.isLoggable(Level.CONFIG)) {
      LOGGER.config("FEED URL: " + feedUrl);
    }
    this.feedUrl = feedUrl;
  }

  /** Gets the URL of the feed to consume. */
  public URL getFeedUrl() {
    return feedUrl;
  }

  /**
   * Sets the GData Service to be used when fetching the feed.
   *
   * @param service the service that executes the feed queries
   */
  public void setService(Service service) {
    if (LOGGER.isLoggable(Level.CONFIG)) {
      // String concatenation is null-safe, unlike an explicit toString()
      // call, so logging can never make this setter throw.
      LOGGER.config("SERVICE: " + service);
    }
    this.service = service;
  }

  /** Gets the GData Service to be used when fetching the feed. */
  public Service getService() {
    return service;
  }

  /** {@inheritDoc} */
  public Session login() {
    return this; // this class behaves as a Session also
  }

  /** {@inheritDoc} */
  public AuthenticationManager getAuthenticationManager() {
    return null; // no authentication used
  }

  /** {@inheritDoc} */
  public AuthorizationManager getAuthorizationManager() {
    return null; // no authorization used
  }

  /** {@inheritDoc} */
  public TraversalManager getTraversalManager() {
    return this; // this class behaves as a TraversalManager also
  }

  /**
   * This connector ignores the batch size hint.
   *
   * The GData reference states that result ordering is up to the
   * implementation. Therefore, we fetch ALL new items every time to ensure
   * that we can traverse them in order of last update.
   *
   * http://code.google.com/apis/gdata/reference.html#Queries
   */
  public void setBatchHint(int batchHint) {
    // Intentionally empty: the hint is not used.
  }

  /**
   * {@inheritDoc}
   *
   * Fetches the feed without any lower bound on the update time.
   */
  public DocumentList startTraversal() throws RepositoryException {
    return fetchResults(null);
  }

  /**
   * {@inheritDoc}
   *
   * In this connector, the checkpoint is a formatted date; the connector
   * attempts to ignore feed entries whose update date is earlier than it.
   *
   * @param checkPoint ifModifiedSince date, in a form accepted by
   *        DateTime.parseDateTime
   * @throws RepositoryException if the checkpoint is null or cannot be
   *         parsed as a date, or if fetching the feed fails
   */
  public DocumentList resumeTraversal(String checkPoint)
      throws RepositoryException {
    if (checkPoint == null) {
      throw new RepositoryException(
          new IllegalArgumentException("checkPoint must not be null"));
    }
    DateTime ifModifiedSince;
    try {
      ifModifiedSince = DateTime.parseDateTime(checkPoint);
    } catch (NumberFormatException nfe) {
      // Report a malformed checkpoint through the declared exception type
      // instead of letting an unchecked exception escape.
      throw new RepositoryException(nfe);
    }
    if (LOGGER.isLoggable(Level.INFO)) {
      LOGGER.info("Using ifModifiedSince of " + ifModifiedSince);
    }
    return fetchResults(ifModifiedSince);
  }

  /**
   * Builds a DocumentList of the entries from this connector's feed, as
   * fetched by the service, sorted by ascending update time.
   *
   * The checkpoint handed to the returned list is the time captured just
   * before the query is issued.
   *
   * @param ifModifiedSince lower bound on the entry update time, or null to
   *        fetch without a bound (see resumeTraversal)
   * @return a DocumentList of entries from the feed; it holds no documents
   *         when the service reports that nothing was modified
   * @throws RepositoryException if the query fails or an entry cannot be
   *         converted into a Document
   */
  private DocumentList fetchResults(DateTime ifModifiedSince)
      throws RepositoryException {
    List documents = new ArrayList();
    // Captured before querying so that changes made while the query runs
    // fall after the checkpoint rather than before it.
    DateTime fetchTime = DateTime.now();

    Query query = new Query(feedUrl);
    query.setMaxResults(MAX_RESULTS);
    if (ifModifiedSince != null) {
      // Filters out entries that were modified before the given date; only
      // recently modified entries are of interest.
      query.setUpdatedMin(ifModifiedSince);
    }

    try {
      // Passing ifModifiedSince to the query as well lets the service answer
      // with NotModifiedException instead of a feed, which avoids the
      // overhead of fetching feed meta data that carries zero entries.
      Feed feed = (Feed) service.query(query, Feed.class, ifModifiedSince);
      List entries = feed.getEntries();
      if (LOGGER.isLoggable(Level.INFO)) {
        LOGGER.info("Fetched " + entries.size() + " of " +
            feed.getTotalResults() + " total updated entries.");
      }
      Collections.sort(entries, new EntryUpdatedAscendingComparator());
      for (Iterator ei = entries.iterator(); ei.hasNext();) {
        documents.add(makeDocument((Entry) ei.next()));
      }
    } catch (NotModifiedException nme) {
      // Nothing changed: fall through and return an empty result set.
      if (LOGGER.isLoggable(Level.INFO)) {
        LOGGER.info(nme.toString());
      }
    } catch (IOException ioe) {
      throw new RepositoryException(ioe);
    } catch (ServiceException se) {
      throw new RepositoryException(se);
    }
    return new GdDocumentList(documents, fetchTime.toString());
  }

  /**
   * Makes a Document from the given Entry object.
   *
   * The document carries the entry id, last-modified time, display URL (when
   * the entry has an HTML link), mime type and content, plus the optional
   * title, summary, category and author meta data.
   *
   * @param entry the feed entry to convert
   * @return a Document holding the entry's content and meta data
   * @throws RepositoryException if the entry's content is of an unhandled
   *         kind (including no content at all) or its media cannot be read
   */
  public static Document makeDocument(Entry entry)
      throws RepositoryException {
    Map map = new HashMap();

    // Copy required properties from entry.
    map.put(SpiConstants.PROPNAME_DOCID,
        makeProperty(entry.getId()));
    map.put(SpiConstants.PROPNAME_LASTMODIFIED,
        makeProperty(entry.getUpdated().toStringRfc822()));
    // Only set the display URL when the entry actually has an HTML link;
    // dereferencing a missing link would abort the whole traversal.
    if (entry.getHtmlLink() != null) {
      map.put(SpiConstants.PROPNAME_DISPLAYURL,
          makeProperty(entry.getHtmlLink().getHref()));
    }

    // Build PROPNAME_MIMETYPE and PROPNAME_CONTENT from entry.
    Content content = entry.getContent();
    if (content instanceof TextContent) {
      map.put(SpiConstants.PROPNAME_MIMETYPE, makeProperty("text/html"));
      map.put(SpiConstants.PROPNAME_CONTENT,
          makeProperty(makeHtmlPage(entry, (TextContent) content)));
    } else if (content instanceof MediaContent) {
      MediaContent mediaContent = (MediaContent) content;
      map.put(SpiConstants.PROPNAME_MIMETYPE,
          makeProperty(mediaContent.getMimeType().getMediaType()));
      map.put(SpiConstants.PROPNAME_CONTENT,
          makeProperty(mediaContent.getMediaSource()));
    } else if (content instanceof OtherContent) {
      OtherContent otherContent = (OtherContent) content;
      map.put(SpiConstants.PROPNAME_MIMETYPE,
          makeProperty(otherContent.getMimeType().getMediaType()));
      map.put(SpiConstants.PROPNAME_CONTENT,
          makeProperty(otherContent.getBytes()));
    } else {
      String message = "Unhandled content: " + content;
      if (LOGGER.isLoggable(Level.SEVERE)) {
        LOGGER.severe(message);
      }
      throw new RepositoryException(message);
    }

    // Extract additional properties to be sent as meta data.
    // (not defined by SpiConstants)
    if (entry.getTitle() != null) {
      map.put(PROPNAME_TITLE,
          makeProperty(entry.getTitle().getPlainText()));
    }
    if (entry.getSummary() != null) {
      map.put(PROPNAME_SUMMARY,
          makeProperty(entry.getSummary().getPlainText()));
    }

    // these are strings like "http://schemas.google.com/g/2005#event"
    if (entry.getCategories() != null) {
      List categoryList = new ArrayList();
      for (Iterator ci = entry.getCategories().iterator(); ci.hasNext();) {
        Category category = (Category) ci.next();
        categoryList.add(Value.getStringValue(category.getTerm()));
      }
      map.put(PROPNAME_CATEGORY, new SimpleProperty(categoryList));
    }

    // Each author contributes a name value and, when present, an email value.
    if (entry.getAuthors() != null) {
      List authorList = new ArrayList();
      for (Iterator ai = entry.getAuthors().iterator(); ai.hasNext();) {
        Person person = (Person) ai.next();
        authorList.add(Value.getStringValue(person.getName()));
        if (person.getEmail() != null) {
          authorList.add(Value.getStringValue(person.getEmail()));
        }
      }
      map.put(PROPNAME_AUTHOR, new SimpleProperty(authorList));
    }

    return new SimpleDocument(map);
  }

  /**
   * Renders a text entry as a minimal HTML page whose title is the entry
   * title and whose body is the entry's text content. Both are plain text,
   * so they are escaped before being placed in the markup. A missing title
   * or text construct is rendered as empty.
   */
  private static String makeHtmlPage(Entry entry, TextContent textContent) {
    String title =
        (entry.getTitle() == null) ? "" : entry.getTitle().getPlainText();
    String body = (textContent.getContent() == null)
        ? "" : textContent.getContent().getPlainText();
    return "<html><head><title>" + escapeHtml(title)
        + "</title></head><body>" + escapeHtml(body)
        + "</body></html>";
  }

  /**
   * Escapes the characters that are significant in HTML markup
   * (&amp;, &lt;, &gt; and the double quote). Returns the empty string for
   * null.
   */
  private static String escapeHtml(String text) {
    if (text == null) {
      return "";
    }
    StringBuffer escaped = new StringBuffer(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      switch (c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        case '"':
          escaped.append("&quot;");
          break;
        default:
          escaped.append(c);
          break;
      }
    }
    return escaped.toString();
  }

  /** Make a SimpleProperty with one value using the given string. */
  public static Property makeProperty(String str) {
    List strList = new ArrayList(1);
    strList.add(Value.getStringValue(str));
    return new SimpleProperty(strList);
  }

  /**
   * Make a SimpleProperty with one value using the bytes read from the given
   * MediaSource. The whole source is buffered in memory.
   *
   * @throws RepositoryException if reading the media source fails
   */
  public static Property makeProperty(MediaSource source)
      throws RepositoryException {
    ByteArrayOutputStream stream = new ByteArrayOutputStream();
    try {
      MediaSource.Output.writeTo(source, stream);
    } catch (IOException ioe) {
      throw new RepositoryException(ioe);
    }
    List list = new ArrayList(1);
    list.add(Value.getBinaryValue(stream.toByteArray()));
    return new SimpleProperty(list);
  }

  /**
   * Make a SimpleProperty with one value using the given byte array directly.
   */
  public static Property makeProperty(byte[] bytes) {
    List list = new ArrayList(1);
    list.add(Value.getBinaryValue(bytes));
    return new SimpleProperty(list);
  }

  /**
   * Orders Entry objects by ascending update time. It is the comparator
   * passed to Collections.sort() in the fetchResults method.
   */
  public static class EntryUpdatedAscendingComparator implements Comparator {

    /**
     * Compares two Entry objects by their update time.
     *
     * @throws ClassCastException if either argument is not an Entry
     */
    public int compare(Object a, Object b) {
      return ((Entry) a).getUpdated().compareTo(((Entry) b).getUpdated());
    }

    /**
     * Returns whether a equals b. Note that this two-argument method is NOT
     * an override of Comparator.equals(Object); it is retained only so that
     * existing callers keep compiling.
     */
    public boolean equals(Object a, Object b) {
      return a.equals(b);
    }
  }
}