Package gov.lanl.adore.djatoka.io

Source Code of gov.lanl.adore.djatoka.io.ExtractorFactory

/*
* Copyright (c) 2010 Brasiliana Digital Library, 2008 Los Alamos National Security, LLC.
*
* Brasiliana Digital Library
* http://www.brasiliana.usp.br
*
* Los Alamos National Laboratory
* Research Library
* Digital Library Research & Prototyping Team
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/

package gov.lanl.adore.djatoka.io;

import eu.medsea.mimeutil.MimeException;
import eu.medsea.mimeutil.detector.OpendesktopMimeDetector;
import gov.lanl.adore.djatoka.DjatokaExtractProcessor;
import gov.lanl.adore.djatoka.IExtract;
import gov.lanl.adore.djatoka.kdu.KduExtractExe;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

import org.apache.log4j.Logger;
import org.apache.log4j.Priority;

/**
* Extractor Factory. Uses format writer/reader implementations.
* @author Fabio Kepler
*
*/
public class ExtractorFactory implements FormatConstants {
    static Logger logger = Logger.getLogger(ExtractorFactory.class);


    // Mimetypes for supported extractor formats
    /** JP2 Mimetype Constant - "image/jp2" */
    // public static final String FORMAT_MIMEYPE_JP2 = "image/jp2";
    /** JPEG Mimetype Constant - "image/jpeg" */
    // public static final String FORMAT_MIMEYPE_JPEG = "image/jpeg";
    /** PDF Mimetype Constant - "image/jpeg" */
    public static final String FORMAT_MIMEYPE_PDF = "application/pdf";
    // default implementations for defined formats
    public static final String DEFAULT_EXTRACTOR = "gov.lanl.adore.djatoka.kdu.KduExtractExe";
    /** Default JP2 Extractor */
    public static final String DEFAULT_JP2_EXTRACTOR = "gov.lanl.adore.djatoka.kdu.KduExtractExe";
    /** Default JPEG Extractor */
    public static final String DEFAULT_JPEG_EXTRACTOR = "gov.lanl.adore.djatoka.plugin.ExtractJPG";
    /** Default PDF Extractor */
    public static final String DEFAULT_PDF_EXTRACTOR = "gov.lanl.adore.djatoka.plugin.ExtractPDF";

    private static HashMap<String, Class> extractorsImpl = new HashMap<String, Class>();
    private static HashMap<String, IExtract> extractors = new HashMap<String, IExtract>();
    private static HashMap<String, DjatokaExtractProcessor> djatokaExtractors = new HashMap<String, DjatokaExtractProcessor>();

    /** MIME util */
    private static OpendesktopMimeDetector opendesktopMimeDetector = new OpendesktopMimeDetector();
    private static final int MAX_CONCURRENT_DETECTIONS = 1;
    private static final Semaphore detectorRateLimit = new Semaphore(MAX_CONCURRENT_DETECTIONS, true); // true: fair => first-in, first-out


    /**
     * Default Constructor, uses default format map.
     */
    public ExtractorFactory() {
        this(getDefaultFormatMap());
    }
   
   
    /**
     * Create a new ExtractorFactory using provided format map. Format maps
     * must be key/value pair of syntax $mimetype=$impl
     * (e.g. image/jpeg=gov.lanl.adore.djatoka.kdu.KduExtractExe)
     * @param formatMap
     */
    public ExtractorFactory(Properties formatMap) {
        for (Map.Entry<Object, Object> i : formatMap.entrySet()) {
            String k = (String) i.getKey();
            String v = (String) i.getValue();
            try {
                Class<?> impl = Class.forName(v);
                if (k != null && impl != null)
                    extractorsImpl.put(k, impl);
            } catch (ClassNotFoundException e) {
                System.err.println("Class not found for format " + k + ": " + v);
                logger.error(e);
            }
        }
    }

   
    /**
     * Create a new ExtractorFactory using provided format map. Format maps
     * must be key/value pair of syntax $mimetype=$impl
     * (e.g. image/jpeg=gov.lanl.adore.djatoka.kdu.KduExtractExe)
     * @return Properties object containing extractor implementation class key/value pairs
     */
    public static Properties getDefaultFormatMap() {
        Properties formatMap = new Properties();
        formatMap.put(FORMAT_MIMEYPE_JP2, DEFAULT_JP2_EXTRACTOR);
        formatMap.put(FORMAT_MIMEYPE_JPEG, DEFAULT_JPEG_EXTRACTOR);
        formatMap.put(FORMAT_MIMEYPE_PDF, DEFAULT_PDF_EXTRACTOR);
        formatMap.put(DEFAULT_EXTRACTOR, DEFAULT_JP2_EXTRACTOR);
        return formatMap;
    }


    /**
     * Returns format extractor implementation for provided format identifier
     * @param format identifier of requested identifier
     * @return format extractor for provided format identifier
     */
    public IExtract getExtractorInstanceForFile(String file) {
        try {
            String format = getMimetypeForFile(file);
            return getExtractorInstanceForFormat(format);
        } catch (IOException ex) {
            logger.log(Priority.FATAL, null, ex);
        }
        return null;
    }

   
    /**
     * Returns format writer implementation for provided format identifier
     * @param format identifier of requested identifier
     * @return format writer for provided format identifier
     */
    public DjatokaExtractProcessor getDjatokaExtractorProcessorForFile(String file) {
        try {
            String format = getMimetypeForFile(file);
            return getDjatokaExtractorProcessorForFormat(format);
        } catch (IOException ex) {
            logger.log(Priority.FATAL, null, ex);
        }
        return null;
    }


    /**
     * Get mimetype for 'file' based on its content.
     * @param file Doesn't need to have an extension.
     * @return Most probable mimetype.
     * @throws FileNotFoundException
     * @throws MimeException
     */
    public static String getMimetypeForFile(String file) throws FileNotFoundException, MimeException {
        if (MAX_CONCURRENT_DETECTIONS > 0) {
            try {
                if (!detectorRateLimit.tryAcquire(0, TimeUnit.SECONDS)) {
                    logger.debug("Waiting for semaphore");
                    detectorRateLimit.acquire();
                    logger.debug("Acquired semaphore");
                }
            } catch (InterruptedException e) {
                // Shouldn't happen?
                logger.error("MimeType detection interrupted waiting for semaphore", e);
            }
        }

        BufferedInputStream bis = null;
        Collection<String> coll = null;
        try {
            bis = new BufferedInputStream(new FileInputStream(file));
            coll = opendesktopMimeDetector.getMimeTypesInputStream(bis);
            logger.debug("coll size: " + (coll == null ? "null" : coll.size()) + "; coll: " + coll.toString());
            return (String) (coll.size() > 0 ? coll.toArray()[0] : "");
        } catch (IllegalArgumentException ex) { // Trying to circumvent a bug in mime-util (see http://sourceforge.net/tracker/?func=detail&aid=3007610&group_id=205064&atid=992132#).
            int max_tries = 2;
            int next_try = 1;
            while (coll == null && next_try <= max_tries) {
                logger.error("Exception in MimeDetector; retrying " + next_try + " of " + max_tries + " try(ies)", ex);
                coll = opendesktopMimeDetector.getMimeTypesInputStream(bis);
            }
            if (coll == null) return "";
            else return (String) (coll.size() > 0 ? coll.toArray()[0] : "");
        } catch (Exception ex) {
            logger.error("Exception in MimeDetector", ex);
            return "";
        } finally {
            if (MAX_CONCURRENT_DETECTIONS > 0) detectorRateLimit.release();
            try {
                if (bis != null) bis.close();
            } catch (IOException ex) {
                logger.error("Closing file stream", ex);
            }
        }
    }

   
    public IExtract getExtractorInstanceForFormat(String format) {
        try {
            if (extractors.containsKey(format)) {
                return extractors.get(format);
            } else if (extractorsImpl.containsKey(format)) {
                extractors.put(format, (IExtract) extractorsImpl.get(format).newInstance());
                return extractors.get(format);
            } else {
                if (extractors.containsKey(DEFAULT_EXTRACTOR)) {
                    return extractors.get(DEFAULT_EXTRACTOR);
                } else if (extractorsImpl.containsKey(DEFAULT_EXTRACTOR)) {
                    extractors.put(DEFAULT_EXTRACTOR, (IExtract) extractorsImpl.get(DEFAULT_EXTRACTOR).newInstance());
                    return extractors.get(DEFAULT_EXTRACTOR);
                }
            }
        } catch (InstantiationException ex) {
            logger.log(Priority.FATAL, null, ex);
        } catch (IllegalAccessException ex) {
            logger.log(Priority.FATAL, null, ex);
        }
        extractors.put(DEFAULT_EXTRACTOR, (IExtract) new KduExtractExe());
        return extractors.get(DEFAULT_EXTRACTOR);
    }

   
    public DjatokaExtractProcessor getDjatokaExtractorProcessorForFormat(String format) {
        if (djatokaExtractors.containsKey(format)) {
            return djatokaExtractors.get(format);
        } else {
            djatokaExtractors.put(format, new DjatokaExtractProcessor(getExtractorInstanceForFormat(format)));
            return djatokaExtractors.get(format);
        }
    }
}
TOP

Related Classes of gov.lanl.adore.djatoka.io.ExtractorFactory

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.