/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.harvest.extraction.document.file;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Method;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import jcifs.smb.NtlmPasswordAuthentication;
import jcifs.smb.SmbException;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.TXTParser;
import org.bson.types.ObjectId;
import org.xml.sax.ContentHandler;
import org.apache.commons.codec.digest.DigestUtils;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import com.google.gson.stream.JsonReader;
import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelMajorException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
import com.ikanow.infinit.e.data_model.store.config.source.SourceFileConfigPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourceFileConfigPojo.StreamingType;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
import com.ikanow.infinit.e.harvest.HarvestContext;
import com.ikanow.infinit.e.harvest.extraction.document.HarvesterInterface;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager;
import com.ikanow.infinit.e.harvest.utils.AuthUtils;
import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
public class FileHarvester implements HarvesterInterface {
@SuppressWarnings("unused")
private static final byte[] SP = " ".getBytes();
private int maxDepth;
private Set<Integer> sourceTypesCanHarvest = new HashSet<Integer>();
private int maxDocsPerCycle = Integer.MAX_VALUE;
@SuppressWarnings("unused")
private static final String TYPES[] = {
"TYPE_COMM",
"TYPE_FILESYSTEM",
"TYPE_NAMED_PIPE",
"TYPE_PRINTER",
"TYPE_SERVER",
"TYPE_SHARE",
"TYPE_WORKGROUP"
};
private int errors = 0;
// Lists of documents to process
private List<DocumentPojo> files = null;
private List<DocumentPojo> docsToAdd = null;
private List<DocumentPojo> docsToUpdate = null;
private List<DocumentPojo> docsToRemove = null;
private boolean _deleteExistingFilesBySourceKey = false;
private HashSet<String> sourceUrlsGettingUpdated = null;
// (tells us source URLs that are being deleted)
private HarvestContext _context;
// Some internal state
private boolean _streaming = false; // (new mode, currently unused)
private boolean _customJob = false; // (some logic is different)
private Date _customLastRecordWritten = null;
// Formatting office docs: allows HTML/XML output, and lets per-parser options be pushed into the tika instance
private Tika _tika = null;
ContentHandler _tikaOutputFormat = null;
StringWriter _tikaXmlFormatWriter;
ParseContext _tikaOutputParseContext = null;
// Can specify regexes to select which files to ignore
private Pattern includeRegex = null; // files only
private Pattern excludeRegex = null; // files and paths
// Security:
private boolean harvestSecureMode = false;
// Try to avoid blowing up the memory:
private long _memUsage = 0;
/**
* Get the bytes of a specific file, located via its source's file share
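* <p>Illustrative usage - the SMB URL shown here is an assumption, not taken from this codebase:
* <pre>{@code
* byte[] bytes = FileHarvester.getFile("smb://fileserver/share/docs/report.pdf", source);
* }</pre>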
* @throws Exception
*/
public static byte[] getFile(String fileURL, SourcePojo source ) throws Exception
{
InputStream in = null;
try
{
InfiniteFile searchFile = searchFileShare( source, fileURL);
if ( searchFile == null )
return null;
else
{
//found the file, return the bytes
in = searchFile.getInputStream();
if (null == in)
return null;
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
int read;
byte[] data = new byte[16384];
while ( (read = in.read(data, 0, data.length)) != -1 )
{
buffer.write(data,0,read);
}
buffer.flush();
return buffer.toByteArray();
}
}
finally {
if (null != in) {
in.close();
}
}
}
/**
* Same as the traverse method, but returns the InfiniteFile if it finds searchFile;
* returns null otherwise
*
* @param source the source whose file-share configuration and credentials are used
* @param searchFile the URL of the file to find
* @return the matching InfiniteFile, or null if not found
* @throws Exception
*/
private static InfiniteFile searchFileShare( SourcePojo source, String searchFile ) throws Exception
{
// Made this synchronized to work around what looks like a deadlock issue in the underlying code
// This is undesirable and should be removed once the underlying bug has been fixed
// (note in practice this is only an issue for multiple threads going to the same domain)
InfiniteFile f;
synchronized (FileHarvester.class) {
try {
if (null != source.getProcessingPipeline()) { // new style...
SourcePipelinePojo firstElement = source.getProcessingPipeline().iterator().next();
source.setFileConfig(firstElement.file);
source.setUrl(firstElement.file.getUrl());
}//TESTED
if (source.getUrl().startsWith("inf://")) { // Infinit.e share/custom object
NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication(source.getCommunityIds().iterator().next().toString(), source.getOwnerId().toString(), null);
f = InfiniteFile.create(source.getUrl(), auth);
if (f.isDirectory()) {
InfiniteFile subs[] = f.listFiles();
for (InfiniteFile sub: subs) {
if (sub.isDirectory()) { // (can only nest once)
InfiniteFile subs2[] = sub.listFiles();
for (InfiniteFile sub2: subs2) {
if (sub2.getUrlString().equals(searchFile)) {
return sub2;
}//TOTEST
}
}//(end loop over sub-dirs)
else if (sub.getUrlString().equals(searchFile)) {
return sub;
}//TOTEST
}//(end loop over dirs)
}//TOTEST
}//TODO (INF-2122): TOTEST
else if( source.getFileConfig() == null || source.getFileConfig().password == null || source.getFileConfig().username == null)
{
f = InfiniteFile.create(searchFile);
}
else
{
if (source.getFileConfig().domain == null) {
source.getFileConfig().domain = "";
}
NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication(source.getFileConfig().domain, source.getFileConfig().username, source.getFileConfig().password);
f = InfiniteFile.create(searchFile, auth);
}
}//TESTED
catch (Exception e) {
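// Fallback: assume the last path element was a file that couldn't be opened directly -
// strip it off and retry against the parent; if the parent itself resolves to a directory
// then the original path didn't point at a file, so give up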
int nIndex = searchFile.lastIndexOf("/");
searchFile = searchFile.substring(0, nIndex); // (ie not including the /)
f = searchFileShare(source, searchFile);
if (f.isDirectory()) {
throw new MalformedURLException(searchFile + " is directory.");
}
}//TESTED
return f;
}
// (End INF-1406 sync bug, see above explanation)
} //TESTED
/**
* Get the list of docs for this source
* @return the list of candidate documents found (also stored in this.files)
* @throws Exception
*/
private List<DocumentPojo> getFiles(SourcePojo source) throws Exception {
InfiniteFile file = null;
_deleteExistingFilesBySourceKey = false;
try
{
if (source.getUrl().startsWith("inf://")) { // Infinit.e share/custom object
NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication(Arrays.toString(source.getCommunityIds().toArray()), source.getOwnerId().toString(), null);
file = InfiniteFile.create(source.getUrl(), auth);
if (source.getUrl().startsWith("inf://custom/")) {
_customJob = true;
// A few cases:
// 1] If this is the first time, or the source has completed:
// quick check of the share/custom date vs the last imported doc in this case:
ObjectId customLastRecordId = null;
// Here are the two sub-cases (whichever of success/error/success_iteration the status is in):
// 1) non-append mode ... any time first_record.time > last_doc.time, re-run (delete all docs)
// 2) append mode ...... any time last_record.time > last_doc.time, re-run/keep going
// (the status clause below just determines whether you keep going or not;
// the file.getDate() call will automatically give you the correct time for 1 vs 2 depending on its status)
if ((null == source.getHarvestStatus()) || (HarvestEnum.success == source.getHarvestStatus().getHarvest_status()))
{
if (!_context.getDuplicateManager().needsUpdated_Url(new Date(file.getDate()), null, source)) {
return files;
}//TESTED
else {
_customLastRecordWritten = _context.getDuplicateManager().getLastModifiedDate();
customLastRecordId = _context.getDuplicateManager().getLastModifiedDocId();
_context.getDuplicateManager().resetForNewSource();
// (reset the saved state since I faked my harvest status)
_deleteExistingFilesBySourceKey = true;
}//TESTED
}
else { // 2] If in the middle of a multiple harvest cycle....
// Specifically for custom, need to handle m/r changing ... we'll fake the harvest status
// to force it to check the last doc's modified time vs the current file time...
HarvestEnum saved = source.getHarvestStatus().getHarvest_status();
source.getHarvestStatus().setHarvest_status(HarvestEnum.success);
try {
if (_context.getDuplicateManager().needsUpdated_Url(new Date(file.getDate()), null, source)) {
_deleteExistingFilesBySourceKey = true;
}
_customLastRecordWritten = _context.getDuplicateManager().getLastModifiedDate();
customLastRecordId = _context.getDuplicateManager().getLastModifiedDocId();
_context.getDuplicateManager().resetForNewSource();
// (reset the saved state since I faked my harvest status)
}
finally { // (rewrite original)
source.getHarvestStatus().setHarvest_status(saved);
}
}//TESTED
if (_streaming) { // Never delete files...
_deleteExistingFilesBySourceKey = false;
}//TESTED
if (null == customLastRecordId) { // no docs, so no need for this
// (or -in the case of distributed sources- the new harvest has already begun)
_deleteExistingFilesBySourceKey = false;
}//TESTED
// Custom append mode: never delete anything, only process new objects
InternalInfiniteFile customHandle = (InternalInfiniteFile)file;
if (customHandle.isAppendingNotReplacing()) {
_deleteExistingFilesBySourceKey = false;
}//TESTED
// Finally, if we wanted to delete the files then go ahead now:
if (_deleteExistingFilesBySourceKey) {
// For now, support only "non-append" mode efficiently:
// Always delete all the old docs, updated docs will work but inefficiently (will delete and re-create)
DocumentPojo docRepresentingSrcKey = new DocumentPojo();
if (null != source.getDistributionFactor()) {
// If split across multiple docs then need a more expensive delete (note: still indexed)
docRepresentingSrcKey.setId(customLastRecordId);
}
docRepresentingSrcKey.setCommunityId(source.getCommunityIds().iterator().next());
docRepresentingSrcKey.setSourceKey(source.getKey());
this.docsToRemove.add(docRepresentingSrcKey);
}//TESTED
}
else { // share - this is much simpler:
if (!_context.getDuplicateManager().needsUpdated_Url(new Date(file.getDate()), null, source)) {
return files;
}//TESTED
}
}//TESTED
else if( source.getFileConfig() == null || source.getFileConfig().password == null || source.getFileConfig().username == null)
{
// Local file: => must be admin to continue
if (harvestSecureMode) { // secure mode, must be admin
if (source.getUrl().startsWith("file:")) {
if (!AuthUtils.isAdmin(source.getOwnerId())) {
throw new ExtractorSourceLevelMajorException("Permission denied");
}
}
}//TODO (INF-2119): come up with something better than this...(this is at least consistent with SAH/UAH security, apart from allowing admin more rights)
file = InfiniteFile.create(source.getUrl());
}
else
{
if (source.getFileConfig().domain == null) {
source.getFileConfig().domain = "";
}
NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication(source.getFileConfig().domain, source.getFileConfig().username, source.getFileConfig().password);
file = InfiniteFile.create(source.getUrl(), auth);
}
traverse(file, source, maxDepth);
}
catch (Exception e) {
// If an exception here this is catastrophic, throw it upwards:
errors++;
throw e;
}
return files;
}
/**
* Constructor: registers the source types that can be harvested and loads
* system defaults (max depth, max docs per cycle, security mode)
*/
public FileHarvester()
{
sourceTypesCanHarvest.add(InfiniteEnums.FILES);
maxDepth = 5;
PropertiesManager pm = new PropertiesManager();
maxDocsPerCycle = pm.getMaxDocsPerSource();
harvestSecureMode = pm.getHarvestSecurity();
}
// Process the files for this source: compile path filters, traverse the share, then dedup the results
private void processFiles(SourcePojo source) throws Exception {
// Can override system settings if less:
if ((null != source.getThrottleDocs()) && (source.getThrottleDocs() < maxDocsPerCycle)) {
maxDocsPerCycle = source.getThrottleDocs();
}
sourceUrlsGettingUpdated = new HashSet<String>();
LinkedList<String> duplicateSources = new LinkedList<String>();
try {
// Compile regexes if they are present
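// (illustrative: a pathInclude of ".*\\.pdf$" would restrict the harvest to PDF files;
// these are standard java.util.regex patterns, matched case-insensitively against the full path)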
if ((null != source.getFileConfig()) && (null != source.getFileConfig().pathInclude)) {
includeRegex = Pattern.compile(source.getFileConfig().pathInclude, Pattern.CASE_INSENSITIVE);
}
if ((null != source.getFileConfig()) && (null != source.getFileConfig().pathExclude)) {
excludeRegex = Pattern.compile(source.getFileConfig().pathExclude, Pattern.CASE_INSENSITIVE);
}
if ((null != source.getFileConfig()) && (null != source.getFileConfig().maxDepth)) {
this.maxDepth = source.getFileConfig().maxDepth;
}
// Process the fileshare
getFiles(source);
}
catch (Exception e) {
// If an exception here this is catastrophic, throw it upwards:
errors++;
throw e;
}
try {
//Dedup code, ironically enough partly duplicated in parse(), probably unnecessarily
DuplicateManager qr = _context.getDuplicateManager();
for(DocumentPojo doc: files)
{
try {
duplicateSources.clear();
if (null != doc.getSourceUrl()) {
boolean add = true;
// However we still need to check for duplicates so we can update entities correctly (and maintain _ids, etc)
// We only do this if the source URL changes (unless the URL is taken from the object, in which case all bets are off)
boolean sourceUrlUpdated = sourceUrlsGettingUpdated.contains(doc.getSourceUrl());
if (!doc.getHasDefaultUrl() || sourceUrlUpdated) { // ie the doc has its own URL, or its source URL is being updated
// (only if the sourceUrl is not new...)
if (qr.isDuplicate_Url(doc.getUrl(), source, duplicateSources)) {
doc.setUpdateId(qr.getLastDuplicateId()); // (set _id to doc we're going to replace)
if (!sourceUrlUpdated && !_deleteExistingFilesBySourceKey) {
// Here update instead so we delete the old doc and add the new one
add = false;
docsToUpdate.add(doc);
}//TESTED
else {
// (else *still* don't add this to updates because we've added the source URL or source key to the delete list)
// (hence approximate create with the updateId...)
if (null != doc.getUpdateId()) {
doc.setCreated(new Date(doc.getUpdateId().getTime()));
}//TESTED
}//TESTED
}
//(note we don't worry about duplicate sources in this case - just too complex+rare a case)
}//TESTED (src url changing, different src url, non-default URL)
// For composite files we (almost always) delete everything that already exists (via docsToRemove) and then add new docs
if (add) {
docsToAdd.add(doc);
}
//TESTED
}
else if (qr.isDuplicate_Url(doc.getUrl(), source, duplicateSources)) {
// Other files, if the file already exists then update it (essentially, delete/add)
doc.setUpdateId(qr.getLastDuplicateId()); // (set _id to doc we're going to replace)
docsToUpdate.add(doc);
}
else { // if duplicateSources is non-empty then this URL is a duplicate of one from a different source
if (!duplicateSources.isEmpty()) {
doc.setDuplicateFrom(duplicateSources.getFirst());
}
docsToAdd.add(doc);
}
}
catch (Exception e) {
errors++;
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
}
}
}
catch (Exception e) {
// If an exception here this is catastrophic, throw it upwards:
errors++;
throw e;
}
}
private void parse( InfiniteFile f, SourcePojo source ) throws MalformedURLException, URISyntaxException {
//NOTE: we only ever break out of here because of max docs in standalone mode
// (because we don't know how to continue reading)
DocumentPojo doc = null;
//Determine File Extension
String fileName = f.getName().toString();
int mid= fileName.lastIndexOf(".");
String extension = fileName.substring(mid+1,fileName.length());
//Checked to save processing time
long fileTimestamp = (f.getDate()/1000)*1000;
// (ensure truncated to seconds, since some operation somewhere here does this...)
Date modDate = new Date(fileTimestamp);
//XML Data gets placed into MetaData
boolean bIsXml = false;
boolean bIsJson = false;
boolean bIsLineOriented = false;
if ((null != source.getFileConfig()) && (null != source.getFileConfig().type)) {
extension = source.getFileConfig().type;
}
bIsXml = extension.equalsIgnoreCase("xml");
bIsJson = extension.equalsIgnoreCase("json");
bIsLineOriented = extension.endsWith("sv");
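// (so "csv", "tsv", or any other "*sv" extension is treated as line-oriented)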
if (bIsXml || bIsJson || bIsLineOriented)
{
int debugMaxDocs = Integer.MAX_VALUE; // by default don't set this, it's only for debug mode
if (_context.isStandalone()) { // debug mode
debugMaxDocs = maxDocsPerCycle;
}
//fast check to see if the file has changed before processing (or if it never existed)
if(needsUpdated_SourceUrl(modDate, f.getUrlString(), source))
{
if (0 != modDate.getTime()) { // if it ==0 then sourceUrl doesn't exist at all, no need to delete
// This file already exists - in normal/managed mode will re-create
// In streaming mode, simple skip over
if (_streaming) {
return;
}//TESTED
DocumentPojo docRepresentingSrcUrl = new DocumentPojo();
docRepresentingSrcUrl.setSourceUrl(f.getUrlString());
docRepresentingSrcUrl.setSourceKey(source.getKey());
docRepresentingSrcUrl.setCommunityId(source.getCommunityIds().iterator().next());
sourceUrlsGettingUpdated.add(docRepresentingSrcUrl.getSourceUrl());
this.docsToRemove.add(docRepresentingSrcUrl);
// (can add documents with just source URL, are treated differently in the core libraries)
}
SourceFileConfigPojo fileSystem = source.getFileConfig();
if ((null == fileSystem) && (bIsXml || bIsJson)) {
fileSystem = new SourceFileConfigPojo();
}
XmlToMetadataParser xmlParser = null;
JsonToMetadataParser jsonParser = null;
String urlType = extension;
if (bIsXml) {
xmlParser = new XmlToMetadataParser(fileSystem.XmlRootLevelValues,
fileSystem.XmlIgnoreValues, fileSystem.XmlSourceName, fileSystem.XmlPrimaryKey,
fileSystem.XmlAttributePrefix, fileSystem.XmlPreserveCase, debugMaxDocs);
}//TESTED
else if (bIsJson) {
jsonParser = new JsonToMetadataParser(fileSystem.XmlSourceName, fileSystem.XmlRootLevelValues, fileSystem.XmlPrimaryKey, fileSystem.XmlIgnoreValues, debugMaxDocs);
}//TESTED
List<DocumentPojo> partials = null;
try {
if (bIsXml) {
XMLStreamReader xmlStreamReader = null;
XMLInputFactory factory = XMLInputFactory.newInstance();
factory.setProperty(XMLInputFactory.IS_COALESCING, true);
factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
try {
xmlStreamReader = factory.createXMLStreamReader(f.getInputStream());
partials = xmlParser.parseDocument(xmlStreamReader);
_memUsage += xmlParser.getMemUsage();
}
finally {
if (null != xmlStreamReader) xmlStreamReader.close();
}
}//TESTED
else if (bIsJson) {
JsonReader jsonReader = null;
try {
jsonReader = new JsonReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
jsonReader.setLenient(true);
partials = jsonParser.parseDocument(jsonReader);
_memUsage += jsonParser.getMemUsage();
}
finally {
if (null != jsonReader) jsonReader.close();
}
}//TESTED
else if (bIsLineOriented) { // Just generate a document for every line
BufferedReader lineReader = null;
try {
lineReader = new BufferedReader(new InputStreamReader(f.getInputStream(), "UTF-8"));
CsvToMetadataParser lineParser = new CsvToMetadataParser(debugMaxDocs);
partials = lineParser.parseDocument(lineReader, source);
_memUsage += lineParser.getMemUsage();
}
finally {
if (null != lineReader) lineReader.close();
}
}//TESTED
MessageDigest md5 = null; // (used below to generate unique URLs if the user doesn't specify one)
try {
md5 = MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e) {
// Do nothing, unlikely to happen...
}
int nIndex = 0;
int numPartials = partials.size();
for (DocumentPojo doctoAdd : partials)
{
nIndex++;
doctoAdd.setSource(source.getTitle());
doctoAdd.setSourceKey(source.getKey());
doctoAdd.setMediaType(source.getMediaType());
doctoAdd.setModified(new Date(fileTimestamp));
doctoAdd.setCreated(new Date());
if(null == doctoAdd.getUrl()) { // Can be set in the parser or here
doctoAdd.setHasDefaultUrl(true); // (ie cannot occur in a different src URL)
if (1 == numPartials) {
String urlString = f.getUrlString();
if (urlString.endsWith(urlType)) {
doctoAdd.setUrl(urlString);
}
else {
doctoAdd.setUrl(new StringBuffer(urlString).append('.').append(urlType).toString());
}
// (we always set sourceUrl as the true url of the file, so want to differentiate the URL with
// some useful information)
}
else if (null == doctoAdd.getMetadata()) { // Line oriented case
doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/").append(nIndex).append('.').append(urlType).toString());
}
else {
if (null == md5) { // Will never happen, MD5 always exists
doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/").append(doctoAdd.getMetadata().hashCode()).append('.').append(urlType).toString());
}
else { // This is the standard call if the XML parser has not been configured to build the URL
doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/").append(DigestUtils.md5Hex(doctoAdd.getMetadata().toString())).append('.').append(urlType).toString());
}
}//TESTED
}
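// (illustrative: a file "smb://server/share/data.xml" containing several records would yield doc URLs like
// "smb://server/share/data.xml/<md5-of-metadata>.xml", while sourceUrl stays the real file URL)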
doctoAdd.setTitle(f.getName().toString());
doctoAdd.setPublishedDate(new Date(fileTimestamp));
doctoAdd.setSourceUrl(f.getUrlString());
// Always add to files because I'm deleting the source URL
files.add(doctoAdd);
}//TESTED
} catch (XMLStreamException e1) {
errors++;
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
} catch (FactoryConfigurationError e1) {
errors++;
_context.getHarvestStatus().logMessage(e1.getMessage(), true);
} catch (IOException e1) {
errors++;
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
}
catch (Exception e1) {
errors++;
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true);
}
}//(end if needs updated)
}
else //Tika supports Excel, Word, PowerPoint, Visio, and Outlook documents
{
// (This dedup tells me if it's an add/update vs ignore - qr.isDuplicate higher up tells me if I need to add or update)
if(needsUpdated_Url(modDate, f.getUrlString(), source))
{
Metadata metadata = null;
InputStream in = null;
try {
doc = new DocumentPojo();
// Create a tika object (first time only)
if (null == _tika) {
this.initializeTika(_context, source);
}
// TODO: we should probably cap the limit at 30MB or 50MB and bypass anything over that, ie:
// tika.setMaxStringLength(30*1024*1024);
// For now, disable the string length limit entirely
_tika.setMaxStringLength(-1);
// Create a metadata object to contain the metadata
metadata = new Metadata();
// Parse the file and get the text of the file
doc.setSource(source.getTitle());
doc.setSourceKey(source.getKey());
doc.setMediaType(source.getMediaType());
String fullText = "";
in = f.getInputStream();
try {
if (null == _tikaOutputFormat) { // text only
fullText = _tika.parseToString(in, metadata);
}//TESTED
else { // XML/HTML
_tika.getParser().parse(in, _tikaOutputFormat, metadata, _tikaOutputParseContext);
fullText = _tikaXmlFormatWriter.toString();
_tikaXmlFormatWriter.getBuffer().setLength(0);
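// (the transformer handler set up in initializeTika() writes into _tikaXmlFormatWriter;
// resetting the buffer above means the next file starts from a clean writer)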
}//TESTED
}
finally {
if (null != in) in.close();
}
int descCap = 500;
doc.setFullText(fullText);
if (descCap > fullText.length())
{
descCap = fullText.length();
}
doc.setDescription(fullText.substring(0,descCap));
doc.setModified(new Date(fileTimestamp));
doc.setCreated(new Date());
doc.setUrl(f.getUrlString());
doc.setTitle(f.getName().toString());
doc.setPublishedDate(new Date(fileTimestamp));
_memUsage += (250L*(doc.getFullText().length() + doc.getDescription().length()))/100L; // 25% overhead, 2x for string->byte
// If the metadata contains a title then use that:
try {
String title = metadata.get(Metadata.TITLE);
if (null != title) {
doc.setTitle(title);
}
}
catch (Exception e) { // Fine just carry on
}
// If the metadata contains a more plausible date then use that:
try {
Date date = metadata.getDate(Metadata.CREATION_DATE); // MS Word
if (null != date) {
doc.setPublishedDate(date);
}
else {
date = metadata.getDate(Metadata.DATE); // Dublin
if (null != date) {
doc.setPublishedDate(date);
}
else {
date = metadata.getDate(Metadata.ORIGINAL_DATE);
if (null != date) {
doc.setPublishedDate(date);
}
}
}
}
catch (Exception e) { // Fine just carry on
}
//TESTED
// If the metadata contains a geotag then apply that:
try {
String lat = metadata.get(Metadata.LATITUDE);
String lon = metadata.get(Metadata.LONGITUDE);
if ((null != lat) && (null != lon)) {
GeoPojo gt = new GeoPojo();
gt.lat = Double.parseDouble(lat);
gt.lon = Double.parseDouble(lon);
doc.setDocGeo(gt);
}
}
catch (Exception e) { // Fine just carry on
}
// Save the entire metadata:
doc.addToMetadata("_FILE_METADATA_", metadata);
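// (note: setCommunityId overwrites, so effectively only the last community ID in the set is kept)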
for(ObjectId communityId: source.getCommunityIds())
{
doc.setCommunityId(communityId);
}
files.add(doc);
// Close the input stream
in.close();
in = null;
//TESTED
} catch (SmbException e) {
errors++;
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
} catch (MalformedURLException e) {
errors++;
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
} catch (UnknownHostException e) {
errors++;
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
}
catch (IOException e) {
errors++;
_context.getHarvestStatus().logMessage(e.getMessage(), true);
} catch (TikaException e) {
errors++;
_context.getHarvestStatus().logMessage(e.getMessage(), true);
}
catch (Exception e) {
errors++;
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
}
finally { // Close the input stream if an error occurs
if (null != in) {
try {
in.close();
} catch (IOException e) {
// All good, do nothing
}
}
} // end exception handling
} // end dedup check
} // end XML vs "office" app
//DEBUG
//System.out.println("FILE=" + files.size() + " / MEM=" + _memUsage + " VS " + Runtime.getRuntime().totalMemory());
}
private void traverse( InfiniteFile f, SourcePojo source, int depth ) throws Exception {
if( depth == 0 ) {
return;
}
InfiniteFile[] l;
try {
// Made this synchronized to work around what looks like deadlock issue in code
// This is undesirable and should be fixed once the underlying bug has been fixed
// (note in practice this is only an issue for multiple threads going to the same domain)
synchronized (FileHarvester.class) {
if (_customJob && (null != _customLastRecordWritten)) {
l = f.listFiles(_customLastRecordWritten);
}
else {
l = f.listFiles();
}
for(int i = 0; l != null && i < l.length; i++ ) {
if (null == l[i]) break; // (reached the end of the list)
// Check what the deal with memory usage is:
// (Allow 25% of current heap)
if ((_memUsage*4) > Runtime.getRuntime().maxMemory()) {
source.setReachedMaxDocs();
break;
}//TESTED
// Check to see if the item is a directory or a file that needs to be parsed;
// if it is a file then parse it (using tika where needed);
// if it is a directory then use recursion to dive into the directory
if (files.size() >= this.maxDocsPerCycle) {
source.setReachedMaxDocs();
break;
}
if( l[i].isDirectory() ) {
// Directories: included unless explicitly excluded:
String path = l[i].getUrlPath();
boolean bProcess = true;
if (null != excludeRegex) {
if (excludeRegex.matcher(path).matches()) {
bProcess = false;
}
}//TESTED
if (bProcess) {
traverse( l[i], source, depth - 1 );
if (source.reachedMaxDocs()) { // (express elevator back to recursion root)
return;
}
}
}
else {
boolean bProcess = true;
// Files: check both include and exclude and distribution logic
String path = l[i].getUrlPath();
// Intra-source distribution logic:
if ((null != source.getDistributionTokens()) && (null != source.getDistributionFactor())) {
int split = Math.abs(path.hashCode()) % source.getDistributionFactor();
if (!source.getDistributionTokens().contains(split)) {
bProcess = false;
}
}//TESTED
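// (illustrative: with distributionFactor 4 and tokens {0,2}, a path whose hashCode is 10 gives
// split 10 % 4 = 2, so this node processes it; a path hashing to 5 would be left to another node)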
if (bProcess && (null != includeRegex)) {
if (!includeRegex.matcher(path).matches()) {
bProcess = false;
}
}
if (bProcess && (null != excludeRegex)) {
if (excludeRegex.matcher(path).matches()) {
bProcess = false;
}
}//TESTED
if (bProcess) {
parse( l[i], source);
// (Adds to this.files)
// If we've got here, check what we should do with the file
if (!_context.isStandalone()) {
if ((null != source.getFileConfig()) && (null != source.getFileConfig().renameAfterParse)) {
try {
if (source.getFileConfig().renameAfterParse.isEmpty() || source.getFileConfig().renameAfterParse.equals("."))
{ // delete it
l[i].delete();
}//TESTED
else {
l[i].rename(createNewName(l[i], source.getFileConfig().renameAfterParse));
}//TESTED
}
catch (IOException e) { // doesn't seem worth bombing out but should error
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
}
}//TESTED
}
}//(not excluded)
}//(file not directory)
}//(end loop over directory files)
}
// (End INF-1406 sync bug, see above explanation)
} catch (Exception e) {
if (maxDepth == depth) { // Top level error, abandon ship
errors++;
throw e;
}
else { // Already had some luck with this URL keep going
errors++;
_context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true);
}
}
}
private boolean needsUpdated_SourceUrl(Date mod, String sourceUrl, SourcePojo source)
{
try {
DuplicateManager qr = _context.getDuplicateManager();
return qr.needsUpdated_SourceUrl(mod, sourceUrl, source);
}
catch (Exception e) {
// Do nothing
}
return false;
}
private boolean needsUpdated_Url(Date mod, String url, SourcePojo source)
{
try {
DuplicateManager qr = _context.getDuplicateManager();
return qr.needsUpdated_Url(mod, url, source);
}
catch (Exception e) {
// Do nothing
}
return false;
}
@Override
public boolean canHarvestType(int sourceType) {
return sourceTypesCanHarvest.contains(sourceType);
}
@Override
public void executeHarvest(HarvestContext context, SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove) {
_context = context;
if (_context.isStandalone()) {
maxDocsPerCycle = _context.getStandaloneMaxDocs();
}
try
{
// Defaults to a "normal" mode that involves trying to spot existing files that have been modified and re-creating their harvested docs
// In streaming mode it will just skip over those files and carry on
// (This should be particularly useful for custom mode: you can just re-run the same job on the last day's data and the source will keep adding the new docs)
if ((null != source.getFileConfig()) && (null != source.getFileConfig().mode)
&& (StreamingType.streaming == source.getFileConfig().mode))
{
_streaming = true;
}
//logger.debug("Source: " + source.getUrl());
//create new list for files
this.files = new LinkedList<DocumentPojo>();
this.docsToAdd = toAdd;
this.docsToUpdate = toUpdate;
this.docsToRemove = toRemove;
processFiles(source);
//harvested "successfully", post in mongo
String logMsg = (0 == errors)?(""):(new StringBuffer().append(errors).append(" file error(s).").toString());
_context.getHarvestStatus().update(source, new Date(), HarvestEnum.in_progress, logMsg, false, false);
}
catch (Exception e)
{
errors++;
_context.getHarvestStatus().update(source,new Date(),HarvestEnum.error,e.getMessage(), true, false);
}
finally {
// (ie these can be deleted once the harvest is complete)
this.files = null;
this.docsToAdd = null;
this.docsToUpdate = null;
this.docsToRemove = null;
}
}
// Renaming utility
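// (illustrative: with renameAfterParse set to "$path/done/$name", "smb://server/share/a.csv"
// would be renamed to "smb://server/share/done/a.csv"; "" or "." instead mean "delete", handled above)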
private static String createNewName(InfiniteFile subFile, String replacement) throws MalformedURLException, UnsupportedEncodingException, URISyntaxException {
String path = subFile.getUrlString(); // (currently the entire string)
String name = subFile.getName();
int startOfName = path.lastIndexOf(name);
return replacement.replace("$name", name).replace("$path", path.substring(0, startOfName - 1));
}
/////////////////////////////////////////////////////////////////////////////////////
// Get tika options:
// Bonus option output:xml|xhtml|html (the default is plain text)
// Bonus option bypass:<media type> (routes that media type to the plain TXTParser)
// Example option: "application/pdf:{setEnableAutoSpace:false}", ie the format is mediaType:JSON
// where JSON is key/value pairs of setter name and argument (only String, boolean, and int/long/double types are supported)
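// (illustrative full configuration via the source's file.XmlRootLevelValues array:
// [ "output:xhtml", "bypass:image/jpeg", "application/pdf:{setEnableAutoSpace:false}" ] )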
private void initializeTika(HarvestContext context, SourcePojo source)
{
AutoDetectParser autoDetectParser = new AutoDetectParser();
if ((null != source.getFileConfig()) && (null != source.getFileConfig().XmlRootLevelValues)) {
for (String s: source.getFileConfig().XmlRootLevelValues) {
int separator = s.indexOf(':');
String jsonStr = s.substring(separator + 1);
if (separator > 0) {
String mediaType = s.substring(0, separator);
if (mediaType.equalsIgnoreCase("output")) { //special case, just going to configure output
if (jsonStr.equalsIgnoreCase("xml") || jsonStr.equalsIgnoreCase("xhtml")) {
_tikaXmlFormatWriter = new StringWriter();
_tikaOutputFormat = getTransformerHandler("xml", _tikaXmlFormatWriter);
_tikaOutputParseContext = new ParseContext();
}
if (jsonStr.equalsIgnoreCase("html")) {
_tikaXmlFormatWriter = new StringWriter();
_tikaOutputFormat = getTransformerHandler("html", _tikaXmlFormatWriter);
_tikaOutputParseContext = new ParseContext();
}
continue;
}//TESTED
else if (mediaType.equalsIgnoreCase("bypass")) {
Map<MediaType, Parser> parsers = autoDetectParser.getParsers();
parsers.put(MediaType.parse(jsonStr), new TXTParser());
autoDetectParser.setParsers(parsers);
continue;
}
// Try to get media type parser:
Parser p = autoDetectParser.getParsers().get(MediaType.parse(mediaType));
while (p instanceof CompositeParser) {
p = ((CompositeParser)p).getParsers().get(MediaType.parse(mediaType));
}
if (null == p) {
context.getHarvestStatus().logMessage("Failed to find application type " + mediaType + " in tika option: " + s, true);
continue;
}//TESTED
// Get JSON objects and try to apply
try {
JsonElement jsonObj = new JsonParser().parse(jsonStr);
for (Map.Entry<String, JsonElement> keyVal: jsonObj.getAsJsonObject().entrySet()) {
if (keyVal.getValue().getAsJsonPrimitive().isBoolean()) { //boolean
try {
Method method = p.getClass().getMethod(keyVal.getKey(), Boolean.class);
method.invoke(p, (Boolean)keyVal.getValue().getAsJsonPrimitive().getAsBoolean());
}
catch (Exception e) {
try {
Method method = p.getClass().getMethod(keyVal.getKey(), Boolean.TYPE);
method.invoke(p, keyVal.getValue().getAsJsonPrimitive().getAsBoolean());
}
catch (Exception e2) {
context.getHarvestStatus().logMessage("Failed to invoke " + keyVal.getKey() + " in tika option: " + s, true);
continue;
}//TESTED
}
}//TESTED
if (keyVal.getValue().getAsJsonPrimitive().isString()) { //string
try {
Method method = p.getClass().getMethod(keyVal.getKey(), String.class);
method.invoke(p, keyVal.getValue().getAsJsonPrimitive().getAsString());
}
catch (Exception e) {
context.getHarvestStatus().logMessage("Failed to invoke " + keyVal.getKey() + " in tika option: " + s, true);
continue;
}
}//TESTED (cut and paste)
if (keyVal.getValue().getAsJsonPrimitive().isNumber()) { // number: int/long/double
// Loads of options: Integer.class, Integer.TYPE, Long.class, Long.TYPE, Double.class, Double.TYPE
boolean invoked = false;
if (!invoked) { // Int.class
try {
Method method = p.getClass().getMethod(keyVal.getKey(), Integer.class);
method.invoke(p, (Integer)keyVal.getValue().getAsJsonPrimitive().getAsInt());
invoked = true;
}
catch (Exception e) {}
}
if (!invoked) { // Int.type
try {
Method method = p.getClass().getMethod(keyVal.getKey(), Integer.TYPE);
method.invoke(p, keyVal.getValue().getAsJsonPrimitive().getAsInt());
invoked = true;
}
catch (Exception e) {}
}
if (!invoked) { // Long.class
try {
Method method = p.getClass().getMethod(keyVal.getKey(), Long.class);
method.invoke(p, (Long)keyVal.getValue().getAsJsonPrimitive().getAsLong());
invoked = true;
}
catch (Exception e) {}
}
if (!invoked) { // Long.type
try {
Method method = p.getClass().getMethod(keyVal.getKey(), Long.TYPE);
method.invoke(p, keyVal.getValue().getAsJsonPrimitive().getAsLong());
invoked = true;
}
catch (Exception e) {}
}
if (!invoked) { // Double.class
try {
Method method = p.getClass().getMethod(keyVal.getKey(), Double.class);
method.invoke(p, (Double)keyVal.getValue().getAsJsonPrimitive().getAsDouble());
invoked = true;
}
catch (Exception e) {}
}
if (!invoked) { // Double.type
try {
Method method = p.getClass().getMethod(keyVal.getKey(), Double.TYPE);
method.invoke(p, keyVal.getValue().getAsJsonPrimitive().getAsDouble());
invoked = true;
}
catch (Exception e) {}
}
}//TOTEST (all the different options)
}//(end loop over options)
}
catch (Exception e) {
context.getHarvestStatus().logMessage("Failed to parse JSON in tika option: " + s, true);
}//TESTED
}
else {
context.getHarvestStatus().logMessage("Failed to parse tika option: " + s, true);
}//TESTED
}//TESTED
}//(end if has options)
_tika = new Tika(TikaConfig.getDefaultConfig().getDetector(), autoDetectParser);
}//TESTED (apart from unused number option configuration)
// (See http://stackoverflow.com/questions/9051183/how-to-use-tikas-xwpfwordextractordecorator-class)
private static TransformerHandler getTransformerHandler(String method, StringWriter sw)
{
try {
SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.setResult(new StreamResult(sw));
return handler;
}
catch (Exception e) {
return null;
}
}//TESTED
}