/*
* OREIngestionCrosswalk.java
*
* Version: $Revision: 1 $
*
* Date: $Date: 2007-07-30 12:26:50 -0500 (Mon, 30 Jul 2007) $
*
* Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
* Institute of Technology. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of the Hewlett-Packard Company nor the name of the
* Massachusetts Institute of Technology nor the names of their
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
package org.dspace.content.crosswalk;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.ConnectException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.sql.SQLException;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import java.util.TreeMap;
import org.apache.log4j.Logger;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Bundle;
import org.dspace.content.DCValue;
import org.dspace.content.DSpaceObject;
import org.dspace.content.FormatIdentifier;
import org.dspace.content.Item;
import org.dspace.content.MetadataSchema;
import org.dspace.content.packager.PackageDisseminator;
import org.dspace.content.packager.PackageException;
import org.dspace.content.packager.PackageParameters;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.core.PluginManager;
import org.dspace.core.ConfigurationManager;
import org.dspace.core.Utils;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;
/**
* ORE ingestion crosswalk
* <p>
* Processes an Atom-encoded ORE resource map and attemps to interpret it as a DSpace item
*
* @author Alexey Maslov
* @version $Revision: 1 $
*/
public class OREIngestionCrosswalk
implements IngestionCrosswalk
{
/** log4j category */
private static Logger log = Logger.getLogger(OREDisseminationCrosswalk.class);
/* Namespaces */
public static final Namespace ATOM_NS =
Namespace.getNamespace("atom", "http://www.w3.org/2005/Atom");
private static final Namespace ORE_ATOM =
Namespace.getNamespace("oreatom", "http://www.openarchives.org/ore/atom/");
private static final Namespace ORE_NS =
Namespace.getNamespace("ore", "http://www.openarchives.org/ore/terms/");
private static final Namespace RDF_NS =
Namespace.getNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#");
private static final Namespace DCTERMS_NS =
Namespace.getNamespace("dcterms", "http://purl.org/dc/terms/");
private static final Namespace DS_NS =
Namespace.getNamespace("ds","http://www.dspace.org/objectModel/");
public void ingest(Context context, DSpaceObject dso, List metadata) throws CrosswalkException, IOException, SQLException, AuthorizeException {
// If this list contains only the root already, just pass it on
List<Element> elements = metadata;
if (elements.size() == 1) {
ingest(context, dso, elements.get(0));
}
// Otherwise, wrap them up
else {
Element wrapper = new Element("wrap",elements.get(0).getNamespace());
wrapper.addContent(elements);
ingest(context,dso,wrapper);
}
}
public void ingest(Context context, DSpaceObject dso, Element root) throws CrosswalkException, IOException, SQLException, AuthorizeException {
Date timeStart = new Date();
if (dso.getType() != Constants.ITEM)
throw new CrosswalkObjectNotSupported("OREIngestionCrosswalk can only crosswalk an Item.");
Item item = (Item)dso;
if (root == null) {
System.err.println("The element received by ingest was null");
return;
}
Document doc = new Document();
doc.addContent(root.detach());
XPath xpathLinks;
List<Element> aggregatedResources;
String entryId;
try {
xpathLinks = XPath.newInstance("/atom:entry/atom:link[@rel=\"" + ORE_NS.getURI()+"aggregates" + "\"]");
xpathLinks.addNamespace(ATOM_NS);
aggregatedResources = xpathLinks.selectNodes(doc);
xpathLinks = XPath.newInstance("/atom:entry/atom:link[@rel='alternate']/@href");
xpathLinks.addNamespace(ATOM_NS);
entryId = ((Attribute)xpathLinks.selectSingleNode(doc)).getValue();
} catch (JDOMException e) {
throw new CrosswalkException("JDOM exception occured while ingesting the ORE");
}
// Next for each resource, create a bitstream
XPath xpathDesc;
NumberFormat nf=NumberFormat.getInstance();
nf.setGroupingUsed(false);
nf.setMinimumIntegerDigits(4);
int countInt=0;
String count;
for (Element resource : aggregatedResources)
{
countInt++;
count = nf.format((long)countInt);
String href = resource.getAttributeValue("href");
log.debug("ORE processing: " + href);
String bundleName;
Element desc = null;
try {
xpathDesc = XPath.newInstance("/atom:entry/oreatom:triples/rdf:Description[@rdf:about=\"" + this.URLencode(href) + "\"][1]");
xpathDesc.addNamespace(ATOM_NS);
xpathDesc.addNamespace(ORE_ATOM);
xpathDesc.addNamespace(RDF_NS);
desc = (Element)xpathDesc.selectSingleNode(doc);
} catch (JDOMException e) {
e.printStackTrace();
}
if (desc != null && desc.getChild("type", RDF_NS).getAttributeValue("resource", RDF_NS).equals(DS_NS.getURI() + "DSpaceBitstream"))
{
bundleName = desc.getChildText("description", DCTERMS_NS);
log.debug("Setting bundle name to: " + bundleName);
}
else {
log.info("Could not obtain bundle name; using 'ORIGINAL'");
bundleName = "ORIGINAL";
}
// Bundle names are not unique, so we just pick the first one if there's more than one.
Bundle[] targetBundles = item.getBundles(bundleName);
Bundle targetBundle;
// if null, create the new bundle and add it in
if (targetBundles.length == 0) {
targetBundle = item.createBundle(bundleName);
item.addBundle(targetBundle);
}
else {
targetBundle = targetBundles[0];
}
URL ARurl = null;
InputStream in = null;
if (href != null) {
try {
// Make sure the url string escapes all the oddball characters
String processedURL = URLencode(href);
// Generate a requeset for the aggregated resource
ARurl = new URL(processedURL);
in = ARurl.openStream();
}
catch(FileNotFoundException fe) {
log.error("The provided URI failed to return a resource: " + href);
}
catch(ConnectException fe) {
log.error("The provided URI was invalid: " + href);
}
}
else {
throw new CrosswalkException("Entry did not contain link to resource: " + entryId);
}
// ingest and update
if (in != null) {
Bitstream newBitstream = targetBundle.createBitstream(in);
String bsName = resource.getAttributeValue("title");
newBitstream.setName(bsName);
// Identify the format
String mimeString = resource.getAttributeValue("type");
BitstreamFormat bsFormat = BitstreamFormat.findByMIMEType(context, mimeString);
if (bsFormat == null) {
bsFormat = FormatIdentifier.guessFormat(context, newBitstream);
}
newBitstream.setFormat(bsFormat);
newBitstream.update();
targetBundle.addBitstream(newBitstream);
targetBundle.update();
}
else {
throw new CrosswalkException("Could not retrieve bitstream: " + entryId);
}
}
log.info("OREIngest for Item "+ item.getID() + " took: " + (new Date().getTime() - timeStart.getTime()) + "ms.");
}
/**
* Helper method to escape all chaacters that are not part of the canon set
* @param sourceString source unescaped string
*/
private String URLencode(String sourceString) {
Character lowalpha[] = {'a' , 'b' , 'c' , 'd' , 'e' , 'f' , 'g' , 'h' , 'i' ,
'j' , 'k' , 'l' , 'm' , 'n' , 'o' , 'p' , 'q' , 'r' ,
's' , 't' , 'u' , 'v' , 'w' , 'x' , 'y' , 'z'};
Character upalpha[] = {'A' , 'B' , 'C' , 'D' , 'E' , 'F' , 'G' , 'H' , 'I' ,
'J' , 'K' , 'L' , 'M' , 'N' , 'O' , 'P' , 'Q' , 'R' ,
'S' , 'T' , 'U' , 'V' , 'W' , 'X' , 'Y' , 'Z'};
Character digit[] = {'0' , '1' , '2' , '3' , '4' , '5' , '6' , '7' , '8' , '9'};
Character mark[] = {'-' , '_' , '.' , '!' , '~' , '*' , '\'' , '(' , ')'};
// reserved
Character reserved[] = {';' , '/' , '?' , ':' , '@' , '&' , '=' , '+' , '$' , ',' ,'%', '#'};
Set<Character> URLcharsSet = new HashSet<Character>();
URLcharsSet.addAll(Arrays.asList(lowalpha));
URLcharsSet.addAll(Arrays.asList(upalpha));
URLcharsSet.addAll(Arrays.asList(digit));
URLcharsSet.addAll(Arrays.asList(mark));
URLcharsSet.addAll(Arrays.asList(reserved));
String processedString = new String();
for (int i=0; i<sourceString.length(); i++) {
char ch = sourceString.charAt(i);
if (URLcharsSet.contains(ch)) {
processedString += ch;
}
else {
processedString += "%" + Integer.toHexString((int)ch);
}
}
return processedString;
}
}