/*
* Copyright (c) 2010 Brasiliana Digital Library, 2008 Los Alamos National Security, LLC.
*
* Brasiliana Digital Library
* http://www.brasiliana.usp.br
*
* Los Alamos National Laboratory
* Research Library
* Digital Library Research & Prototyping Team
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
package gov.lanl.adore.djatoka.io;
import eu.medsea.mimeutil.MimeException;
import eu.medsea.mimeutil.detector.OpendesktopMimeDetector;
import gov.lanl.adore.djatoka.DjatokaExtractProcessor;
import gov.lanl.adore.djatoka.IExtract;
import gov.lanl.adore.djatoka.kdu.KduExtractExe;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import org.apache.log4j.Logger;
import org.apache.log4j.Priority;
/**
* Extractor Factory. Uses format writer/reader implementations.
* @author Fabio Kepler
*
*/
public class ExtractorFactory implements FormatConstants {
static Logger logger = Logger.getLogger(ExtractorFactory.class);
// Mimetypes for supported extractor formats
/** JP2 Mimetype Constant - "image/jp2" */
// public static final String FORMAT_MIMEYPE_JP2 = "image/jp2";
/** JPEG Mimetype Constant - "image/jpeg" */
// public static final String FORMAT_MIMEYPE_JPEG = "image/jpeg";
/** PDF Mimetype Constant - "image/jpeg" */
public static final String FORMAT_MIMEYPE_PDF = "application/pdf";
// default implementations for defined formats
public static final String DEFAULT_EXTRACTOR = "gov.lanl.adore.djatoka.kdu.KduExtractExe";
/** Default JP2 Extractor */
public static final String DEFAULT_JP2_EXTRACTOR = "gov.lanl.adore.djatoka.kdu.KduExtractExe";
/** Default JPEG Extractor */
public static final String DEFAULT_JPEG_EXTRACTOR = "gov.lanl.adore.djatoka.plugin.ExtractJPG";
/** Default PDF Extractor */
public static final String DEFAULT_PDF_EXTRACTOR = "gov.lanl.adore.djatoka.plugin.ExtractPDF";
private static HashMap<String, Class> extractorsImpl = new HashMap<String, Class>();
private static HashMap<String, IExtract> extractors = new HashMap<String, IExtract>();
private static HashMap<String, DjatokaExtractProcessor> djatokaExtractors = new HashMap<String, DjatokaExtractProcessor>();
/** MIME util */
private static OpendesktopMimeDetector opendesktopMimeDetector = new OpendesktopMimeDetector();
private static final int MAX_CONCURRENT_DETECTIONS = 1;
private static final Semaphore detectorRateLimit = new Semaphore(MAX_CONCURRENT_DETECTIONS, true); // true: fair => first-in, first-out
/**
* Default Constructor, uses default format map.
*/
public ExtractorFactory() {
this(getDefaultFormatMap());
}
/**
* Create a new ExtractorFactory using provided format map. Format maps
* must be key/value pair of syntax $mimetype=$impl
* (e.g. image/jpeg=gov.lanl.adore.djatoka.kdu.KduExtractExe)
* @param formatMap
*/
public ExtractorFactory(Properties formatMap) {
for (Map.Entry<Object, Object> i : formatMap.entrySet()) {
String k = (String) i.getKey();
String v = (String) i.getValue();
try {
Class<?> impl = Class.forName(v);
if (k != null && impl != null)
extractorsImpl.put(k, impl);
} catch (ClassNotFoundException e) {
System.err.println("Class not found for format " + k + ": " + v);
logger.error(e);
}
}
}
/**
* Create a new ExtractorFactory using provided format map. Format maps
* must be key/value pair of syntax $mimetype=$impl
* (e.g. image/jpeg=gov.lanl.adore.djatoka.kdu.KduExtractExe)
* @return Properties object containing extractor implementation class key/value pairs
*/
public static Properties getDefaultFormatMap() {
Properties formatMap = new Properties();
formatMap.put(FORMAT_MIMEYPE_JP2, DEFAULT_JP2_EXTRACTOR);
formatMap.put(FORMAT_MIMEYPE_JPEG, DEFAULT_JPEG_EXTRACTOR);
formatMap.put(FORMAT_MIMEYPE_PDF, DEFAULT_PDF_EXTRACTOR);
formatMap.put(DEFAULT_EXTRACTOR, DEFAULT_JP2_EXTRACTOR);
return formatMap;
}
/**
* Returns format extractor implementation for provided format identifier
* @param format identifier of requested identifier
* @return format extractor for provided format identifier
*/
public IExtract getExtractorInstanceForFile(String file) {
try {
String format = getMimetypeForFile(file);
return getExtractorInstanceForFormat(format);
} catch (IOException ex) {
logger.log(Priority.FATAL, null, ex);
}
return null;
}
/**
* Returns format writer implementation for provided format identifier
* @param format identifier of requested identifier
* @return format writer for provided format identifier
*/
public DjatokaExtractProcessor getDjatokaExtractorProcessorForFile(String file) {
try {
String format = getMimetypeForFile(file);
return getDjatokaExtractorProcessorForFormat(format);
} catch (IOException ex) {
logger.log(Priority.FATAL, null, ex);
}
return null;
}
/**
* Get mimetype for 'file' based on its content.
* @param file Doesn't need to have an extension.
* @return Most probable mimetype.
* @throws FileNotFoundException
* @throws MimeException
*/
public static String getMimetypeForFile(String file) throws FileNotFoundException, MimeException {
if (MAX_CONCURRENT_DETECTIONS > 0) {
try {
if (!detectorRateLimit.tryAcquire(0, TimeUnit.SECONDS)) {
logger.debug("Waiting for semaphore");
detectorRateLimit.acquire();
logger.debug("Acquired semaphore");
}
} catch (InterruptedException e) {
// Shouldn't happen?
logger.error("MimeType detection interrupted waiting for semaphore", e);
}
}
BufferedInputStream bis = null;
Collection<String> coll = null;
try {
bis = new BufferedInputStream(new FileInputStream(file));
coll = opendesktopMimeDetector.getMimeTypesInputStream(bis);
logger.debug("coll size: " + (coll == null ? "null" : coll.size()) + "; coll: " + coll.toString());
return (String) (coll.size() > 0 ? coll.toArray()[0] : "");
} catch (IllegalArgumentException ex) { // Trying to circumvent a bug in mime-util (see http://sourceforge.net/tracker/?func=detail&aid=3007610&group_id=205064&atid=992132#).
int max_tries = 2;
int next_try = 1;
while (coll == null && next_try <= max_tries) {
logger.error("Exception in MimeDetector; retrying " + next_try + " of " + max_tries + " try(ies)", ex);
coll = opendesktopMimeDetector.getMimeTypesInputStream(bis);
}
if (coll == null) return "";
else return (String) (coll.size() > 0 ? coll.toArray()[0] : "");
} catch (Exception ex) {
logger.error("Exception in MimeDetector", ex);
return "";
} finally {
if (MAX_CONCURRENT_DETECTIONS > 0) detectorRateLimit.release();
try {
if (bis != null) bis.close();
} catch (IOException ex) {
logger.error("Closing file stream", ex);
}
}
}
public IExtract getExtractorInstanceForFormat(String format) {
try {
if (extractors.containsKey(format)) {
return extractors.get(format);
} else if (extractorsImpl.containsKey(format)) {
extractors.put(format, (IExtract) extractorsImpl.get(format).newInstance());
return extractors.get(format);
} else {
if (extractors.containsKey(DEFAULT_EXTRACTOR)) {
return extractors.get(DEFAULT_EXTRACTOR);
} else if (extractorsImpl.containsKey(DEFAULT_EXTRACTOR)) {
extractors.put(DEFAULT_EXTRACTOR, (IExtract) extractorsImpl.get(DEFAULT_EXTRACTOR).newInstance());
return extractors.get(DEFAULT_EXTRACTOR);
}
}
} catch (InstantiationException ex) {
logger.log(Priority.FATAL, null, ex);
} catch (IllegalAccessException ex) {
logger.log(Priority.FATAL, null, ex);
}
extractors.put(DEFAULT_EXTRACTOR, (IExtract) new KduExtractExe());
return extractors.get(DEFAULT_EXTRACTOR);
}
public DjatokaExtractProcessor getDjatokaExtractorProcessorForFormat(String format) {
if (djatokaExtractors.containsKey(format)) {
return djatokaExtractors.get(format);
} else {
djatokaExtractors.put(format, new DjatokaExtractProcessor(getExtractorInstanceForFormat(format)));
return djatokaExtractors.get(format);
}
}
}