Source Code of net.fortytwo.linkeddata.LinkedDataCache$DataStore

package net.fortytwo.linkeddata;

import net.fortytwo.flow.rdf.RDFBuffer;
import net.fortytwo.flow.rdf.RDFSink;
import net.fortytwo.flow.rdf.SailInserter;
import net.fortytwo.flow.rdf.SesameInputAdapter;
import net.fortytwo.flow.rdf.SesameOutputAdapter;
import net.fortytwo.flow.rdf.SingleContextPipe;
import net.fortytwo.linkeddata.dereferencers.FileURIDereferencer;
import net.fortytwo.linkeddata.dereferencers.HTTPURIDereferencer;
import net.fortytwo.linkeddata.dereferencers.JarURIDereferencer;
import net.fortytwo.linkeddata.rdfizers.ImageRdfizer;
import net.fortytwo.linkeddata.rdfizers.VerbatimRdfizer;
import net.fortytwo.linkeddata.sail.LinkedDataSail;
import net.fortytwo.ripple.Ripple;
import net.fortytwo.ripple.RippleException;
import net.fortytwo.ripple.StringUtils;
import net.fortytwo.ripple.URIMap;
import org.apache.log4j.Logger;
import org.openrdf.model.URI;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFParser;
import org.openrdf.sail.Sail;
import org.openrdf.sail.SailConnection;
import org.openrdf.sail.SailException;
import org.restlet.data.MediaType;
import org.restlet.representation.Representation;

import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;

/**
* A manager for a dynamic set of RDF graphs collected from the Web.
* The cache uses configurable URI dereferencers and RDFizers to fetch and translate documents,
* and connects to an RDF triple store which provides a unified view of the Web of Data.
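* <p>
* A minimal usage sketch (assumes a Sesame MemoryStore on the classpath; the target URI is illustrative
* and exception handling is omitted):
* <pre>{@code
* Sail sail = new org.openrdf.sail.memory.MemoryStore();
* sail.initialize();
* LinkedDataCache cache = LinkedDataCache.createDefault(sail);
* SailConnection sc = sail.getConnection();
* sc.begin();
* CacheEntry.Status status = cache.retrieveUri(
*         new URIImpl("http://dbpedia.org/resource/Berlin"), sc);
* sc.commit();
* sc.close();
* }</pre>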
*
* @author Joshua Shinavier (http://fortytwo.net)
*/
public class LinkedDataCache {
    private static final Logger logger = Logger.getLogger(LinkedDataCache.class);

    public static final String
            CACHE_NS = "http://fortytwo.net/2012/02/linkeddata#";
    public static final URI
            CACHE_MEMO = new URIImpl(CACHE_NS + "memo"),
            CACHE_GRAPH = null; // the default context is used for caching metadata

    private static final String[] NON_RDF_EXTENSIONS = {
            "123", "3dm", "3dmf", "3gp", "8bi", "aac", "ai", "aif", "app", "asf",
            "asp", "asx", "avi", "bat", "bin", "bmp", "c", "cab", "cfg", "cgi",
            "com", "cpl", "cpp", "css", "csv", "dat", "db", "dll", "dmg", "dmp",
            "doc", "drv", "drw", "dxf", "eps", "exe", "fnt", "fon", "gif", "gz",
            "h", "hqx", /*"htm", "html",*/ "iff", "indd", "ini", "iso", "java", /*"jpeg",*/
            /*"jpg",*/ "js", "jsp", "key", "log", "m3u", "mdb", "mid", "midi", "mim",
            "mng", "mov", "mp3", "mp4", "mpa", "mpg", "msg", "msi", "otf", "pct",
            "pdf", "php", "pif", "pkg", "pl", "plugin", "png", "pps", "ppt", "ps",
            "psd", "psp", "qt", "qxd", "qxp", "ra", "ram", "rar", "reg", "rm",
            "rtf", "sea", "sit", "sitx", "sql", "svg", "swf", "sys", "tar", "tif",
            "ttf", "uue", "vb", "vcd", "wav", "wks", "wma", "wmv", "wpd", "wps",
            "ws", /*"xhtml",*/ "xll", "xls", "yps", "zip"};

    private final CachingMetadata metadata;
    private final ValueFactory valueFactory;
    private final boolean useBlankNodes;

    private URIMap uriMap;
    private boolean autoCommit = true;

    private boolean derefSubjects = true;
    private boolean derefPredicates = false;
    private boolean derefObjects = true;
    private boolean derefContexts = false;

    private String acceptHeader = null;

    private CacheExpirationPolicy expirationPolicy;

    // Maps media types to Rdfizers
    private final Map<MediaType, MediaTypeInfo> rdfizers
            = new HashMap<MediaType, MediaTypeInfo>();

    // Maps URI schemes to Dereferencers
    private final Map<String, Dereferencer> dereferencers = new HashMap<String, Dereferencer>();

    private DataStore dataStore;

    /**
     * Constructs a cache with the default settings, dereferencers, and rdfizers.
     *
     * @param sail the underlying triple store for the cache
     * @return the default cache
     * @throws RippleException if construction fails for any reason
     */
    public static LinkedDataCache createDefault(final Sail sail) throws RippleException {
        LinkedDataCache cache = new LinkedDataCache(sail);

        // Add URI dereferencers.
        HTTPURIDereferencer hdref = new HTTPURIDereferencer(cache);
        for (String x : NON_RDF_EXTENSIONS) {
            hdref.blackListExtension(x);
        }
        cache.addDereferencer("http", hdref);

        cache.addDereferencer("file", new FileURIDereferencer());
        cache.addDereferencer("jar", new JarURIDereferencer());

        RDFParser.DatatypeHandling datatypeHandling;
        String p = Ripple.getConfiguration().getString(LinkedDataSail.DATATYPE_HANDLING_POLICY);
        datatypeHandling
                = p.equals("ignore")
                ? RDFParser.DatatypeHandling.IGNORE
                : p.equals("verify")
                ? RDFParser.DatatypeHandling.VERIFY
                : p.equals("normalize")
                ? RDFParser.DatatypeHandling.NORMALIZE
                : null;
        if (null == datatypeHandling) {
            throw new RippleException("no such datatype handling policy: " + p);
        }

        // Rdfizers for registered RDF formats
        // TODO: 'tmp' is a hack to avoid a poorly-understood ConcurrentModificationException
        Collection<RDFFormat> tmp = new LinkedList<RDFFormat>();
        tmp.addAll(RDFFormat.values());
        for (RDFFormat f : tmp) {
            Rdfizer r = new VerbatimRdfizer(f, datatypeHandling);
            for (String type : f.getMIMETypes()) {
                double qualityFactor = type.equals("application/rdf+xml") ? 1.0 : 0.5;
                cache.addRdfizer(new MediaType(type), r, qualityFactor);
            }
        }

        // Additional rdfizers
        Rdfizer imageRdfizer = new ImageRdfizer();
        // Mainstream EXIF-compatible image types: JPEG, TIFF
        cache.addRdfizer(MediaType.IMAGE_JPEG, imageRdfizer, 0.4);
        cache.addRdfizer(new MediaType("image/tiff"), imageRdfizer, 0.4);
        cache.addRdfizer(new MediaType("image/tiff-fx"), imageRdfizer, 0.4);
        // TODO: add an EXIF-based Rdfizer for RIFF WAV audio files

        return cache;
    }

    /**
     * @param sail underlying triple store for the cache
     * @throws RippleException if there is a configuration error
     */
    public LinkedDataCache(final Sail sail) throws RippleException {
        int capacity = Ripple.getConfiguration().getInt(LinkedDataSail.MEMORY_CACHE_CAPACITY);
        this.metadata = new CachingMetadata(capacity, sail.getValueFactory());

        this.valueFactory = sail.getValueFactory();
        useBlankNodes = Ripple.getConfiguration().getBoolean(Ripple.USE_BLANK_NODES);

        this.expirationPolicy = new DefaultCacheExpirationPolicy();

        dataStore = new DataStore() {
            public RDFSink createInputSink(final SailConnection sc) {
                return new SesameOutputAdapter(new SailInserter(sc));
            }
        };
    }

    public void setDataStore(final DataStore dataStore) {
        this.dataStore = dataStore;
    }

    /**
     * @return an application-specific mapping for URIs dereferenced by the cache, or null if no mapping is used
     */
    public URIMap getURIMap() {
        return uriMap;
    }

    /**
     * Defines an application-specific mapping for URIs dereferenced by the cache, in the manner of a Web proxy.
     *
     * @param map the mapping
     */
    public void setURIMap(final URIMap map) {
        this.uriMap = map;
    }

    /**
     * @return an HTTP "Accept" header which matches the cache's collection of rdfizers
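     *         (for example, {@code application/rdf+xml, text/turtle;q=0.5, image/jpeg;q=0.4}
     *         with the default rdfizers; the exact value depends on the registered media types)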
     */
    public String getAcceptHeader() {
        if (null == acceptHeader) {
            StringBuilder sb = new StringBuilder();
            boolean first = true;

            // Order from highest quality to lowest.
            Comparator<MediaTypeInfo> comparator
                    = new Comparator<MediaTypeInfo>() {
                public int compare(final MediaTypeInfo first,
                                   final MediaTypeInfo second) {
                    return first.quality < second.quality ? 1 : first.quality > second.quality ? -1 : 0;
                }
            };

            MediaTypeInfo[] array = new MediaTypeInfo[rdfizers.size()];
            rdfizers.values().toArray(array);
            Arrays.sort(array, comparator);

            for (MediaTypeInfo m : array) {
                if (first) {
                    first = false;
                } else {
                    sb.append(", ");
                }

                sb.append(m.mediaType.getName());
                double quality = m.quality;
                if (1.0 != quality) {
                    sb.append(";q=").append(quality);
                }
            }

            acceptHeader = sb.toString();
        }

        return acceptHeader;
    }

    /**
     * Associates an rdfizer with a given media type.
     *
     * @param mediaType     a MIME type, e.g. "application/rdf+xml", "image/tiff"
     * @param rdfizer       the associated rdfizer
     * @param qualityFactor a quality value ranging from 0 to 1 which expresses
     *                      the client's preference for the given media type.
     *                      This value is used for HTTP content negotiation.
     */
    public void addRdfizer(final MediaType mediaType,
                           final Rdfizer rdfizer,
                           final double qualityFactor) {
        logger.info("adding RDFizer for media type " + mediaType + ": " + rdfizer);

        if (qualityFactor <= 0 || qualityFactor > 1) {
            throw new IllegalArgumentException("quality factor must be between 0 and 1");
        }

        if (null != rdfizers.get(mediaType)) {
            logger.warn("overriding already-registered RDFizer for media type " + mediaType);
        }

        MediaTypeInfo rq = new MediaTypeInfo();
        rq.mediaType = mediaType;
        rq.quality = qualityFactor;
        rq.rdfizer = rdfizer;
        rdfizers.put(mediaType, rq);

        acceptHeader = null;
    }

    /**
     * Associates a dereferencer with a given URI scheme.
     *
     * @param scheme the name of the URI scheme (e.g. "http", "ftp", "file", "jar")
     * @param dref   the associated dereferencer
     */
    public void addDereferencer(final String scheme, final Dereferencer dref) {
        logger.info("adding dereferencer for for URI scheme " + scheme + ": " + dref);

        dereferencers.put(scheme, dref);
    }

    /**
     * Retrieves caching metadata for a URI, possibly dereferencing a document from the Web first.
     *
     * @param uri the URI to dereference
     * @param sc  a connection to a Sail
     * @return the result of the dereferencing operation
     * @throws RippleException if retrieval fails for any reason
     */
    public CacheEntry.Status retrieveUri(final URI uri,
                                         final SailConnection sc) throws RippleException {
        // Find the named graph which stores all information associated with this URI
        String graphUri = RDFUtils.findGraphUri(uri.toString());

        // Note: there is potential for a race condition if two threads access URIs with the
        // same memo concurrently.
        // However, there are no problematic outcomes for such a race condition (apart from the minor
        // concern of a document being retrieved twice) as long as the threads use different SailConnections.
        CacheEntry memo = metadata.getMemo(graphUri, sc);

        // If there is already a (non-expired) entry for this URI, just return its status.
        if (null != memo && !expirationPolicy.isExpired(uri.toString(), memo)) {
            return memo.getStatus();
        }

        // This URI should be treated as a "black box" once created;
        // it need not resemble the URI it was created from.
        String retrievalUri;

        String mapped = null == uriMap ? uri.toString() : uriMap.get(uri.toString());
        retrievalUri = RDFUtils.removeFragmentIdentifier(mapped);

        Dereferencer dref;

        try {
            dref = chooseDereferencer(retrievalUri);
        } catch (URISyntaxException e) {
            return CacheEntry.Status.InvalidUri;
        }

        if (null == dref) {
            return CacheEntry.Status.BadUriScheme;
        }

        logger.info("dereferencing <"
                + StringUtils.escapeURIString(uri.toString()) + ">");
        //+ " at location " + mapped );

        memo = new CacheEntry(CacheEntry.Status.Undetermined);
        memo.setDereferencer(dref.getClass().getName());

        // Note: from this point on, we are committed to actually dereferencing the URI,
        // and failures are explicitly stored as caching metadata.
        try {
            Representation rep;

            memo.setStatus(CacheEntry.Status.DereferencerError);
            rep = dref.dereference(retrievalUri);

            // We have the representation, now try to rdfize it.

            memo.setMediaType(rep.getMediaType());

            Rdfizer rfiz = chooseRdfizer(memo.getMediaType());
            if (null == rfiz) {
                memo.setStatus(CacheEntry.Status.BadMediaType);
                return logStatus(uri, memo);
            }

            memo.setRdfizer(rfiz.getClass().getName());

            RDFSink adder = dataStore.createInputSink(sc);
            RDFBuffer buffer = new RDFBuffer(adder);

            // Note: any context information in the source document is discarded.
            RDFSink pipe = new SingleContextPipe(buffer, valueFactory.createURI(graphUri), valueFactory);

            RDFHandler handler = new SesameInputAdapter(useBlankNodes
                    ? pipe
                    : new BNodeToURIFilter(pipe, valueFactory));

            InputStream is;
            try {
                is = rep.getStream();
            } catch (IOException e) {
                throw new RippleException(e);
            }

            // Use the namespace portion of the original URI as the base URI for the retrieved RDF document.
            String baseUri = uri.getNamespace();

            memo.setStatus(rfiz.rdfize(is, handler, baseUri));

            // Only update the graph in the triple store if the operation was successful.
            if (CacheEntry.Status.Success == memo.getStatus()) {
                try {
                    sc.removeStatements(null, null, null, valueFactory.createURI(graphUri));
                } catch (SailException e) {
                    throw new RippleException(e);
                }

                buffer.flush();
            }
        } finally {
            metadata.setMemo(graphUri, memo, sc);

            // an autocommit happens independently of a call to LinkedDataSail#commit
            if (autoCommit) {
                try {
                    sc.commit();
                    sc.begin();
                } catch (SailException e) {
                    throw new RippleException(e);
                }
            }

            logStatus(uri, memo);
        }

        return memo.getStatus();
    }

    /**
     * @return whether the cache commits to the triple store after each Web request
     * (true by default)
     */
    public boolean isAutoCommit() {
        return autoCommit;
    }

    /**
     * @param autoCommit whether the cache should commit to the triple store after each Web request
     *                   (true by default)
     */
    public void setAutoCommit(boolean autoCommit) {
        this.autoCommit = autoCommit;
    }

    /**
     * @param expirationPolicy the rule by which the cache determines whether a cache entry has expired.
     *                         If an entry has expired, the cache will issue a new request in order to refresh it.
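     *                         For example, a minimal sketch of a policy which never expires entries
     *                         (the {@code cache} variable stands for any LinkedDataCache instance):
     *                         <pre>{@code
     * cache.setExpirationPolicy(new CacheExpirationPolicy() {
     *     public boolean isExpired(String uri, CacheEntry entry) {
     *         return false;  // entries are kept until explicitly removed
     *     }
     * });
     *                         }</pre>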
     */
    public void setExpirationPolicy(CacheExpirationPolicy expirationPolicy) {
        this.expirationPolicy = expirationPolicy;
    }

    public boolean getDereferenceSubjects() {
        return derefSubjects;
    }

    public void setDereferenceSubjects(final boolean flag) {
        this.derefSubjects = flag;
    }

    public boolean getDereferencePredicates() {
        return derefPredicates;
    }

    public void setDereferencePredicates(final boolean flag) {
        this.derefPredicates = flag;
    }

    public boolean getDereferenceObjects() {
        return derefObjects;
    }

    public void setDereferenceObjects(final boolean flag) {
        this.derefObjects = flag;
    }

    public boolean getDereferenceContexts() {
        return derefContexts;
    }

    public void setDereferenceContexts(final boolean flag) {
        this.derefContexts = flag;
    }

    private CacheEntry.Status logStatus(final URI uri,
                                        final CacheEntry memo) {
        CacheEntry.Status status = memo.getStatus();

        if (CacheEntry.Status.Success != status) {
            StringBuilder msg = new StringBuilder("Failed to dereference URI <"
                    + StringUtils.escapeURIString(uri.toString()) + "> (");

            msg.append("dereferencer: ").append(memo.getDereferencer());
            msg.append(", media type: ").append(memo.getMediaType());
            msg.append(", rdfizer: ").append(memo.getRdfizer());
            msg.append("): ").append(status);

            logger.info(msg);
        }

        return status;
    }

    private Dereferencer chooseDereferencer(final String uri) throws URISyntaxException {
        String scheme = new java.net.URI(uri).getScheme();

        return dereferencers.get(scheme);
    }

    private Rdfizer chooseRdfizer(final MediaType mediaType) throws RippleException {
        MediaTypeInfo rq = rdfizers.get(mediaType);
        return (null == rq) ? null : rq.rdfizer;
    }


    private class DefaultCacheExpirationPolicy implements CacheExpirationPolicy {
        private long cacheLifetime;

        public DefaultCacheExpirationPolicy() throws RippleException {
            cacheLifetime = Ripple.getConfiguration().getLong(LinkedDataSail.CACHE_LIFETIME) * 1000;
        }

        public boolean isExpired(final String uri,
                                 final CacheEntry entry) {
            Date last = entry.getTimestamp();
            return null != last
                    && System.currentTimeMillis() - last.getTime() >= cacheLifetime;
        }
    }

    private class MediaTypeInfo {
        MediaType mediaType;
        public double quality;
        public Rdfizer rdfizer;
    }

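    /**
     * A factory for the RDF sinks into which retrieved statements are written.
     * The default implementation (see the constructor) wraps the SailConnection in a SailInserter;
     * an alternative implementation might, for example, filter or count incoming statements.
     */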
    public interface DataStore {
        RDFSink createInputSink(SailConnection sc);
    }
}