Source Code of com.findwise.hydra.stage.tika.TextExtractionStage

package com.findwise.hydra.stage.tika;


import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;


import com.findwise.utils.tika.InputStreamParser;
import com.findwise.utils.tika.ParsedData;
import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;


import com.findwise.hydra.local.LocalDocument;
import com.findwise.hydra.stage.AbstractProcessStage;
import com.findwise.hydra.stage.Parameter;
import com.findwise.hydra.stage.RequiredArgumentMissingException;
import com.findwise.hydra.stage.Stage;


@Stage(description = "A stage that fetches the content from a given url and appends it the the document")
public class TextExtractionStage extends AbstractProcessStage {
    private static Logger logger = LoggerFactory.getLogger(TextExtractionStage.class);


    @Parameter(description = "The max size to be fetched. Default: -1 = unlimited")
    private long maxSizeInBytes = -1;
    @Parameter(required = true, description = "The prefix to add to the metadata fields when adding them to the content")
    private String metadataPrefix;
    @Parameter(required = true, description = "The field name where the extracted content will be stored")
    private String contentField;
    @Parameter(required = true, description = "The field where the url can be found")
    private String urlField;
    @Parameter(description = "The field where the file size can be found")
    private String fileSizeField;
    @Parameter(description = "The allowed file formats. Default: null = all")
    private List<String> allowedFileFormats = null;
    @Parameter(description = "The field where the file format can be found")
    private String fileFormatField;


    private Set<String> lowerCaseAllowedFileFormatsSet = null;


    @Override
    public void process(LocalDocument doc) {
        InputStream stream = null;
        long size = getFileSize(doc);


        if (!okFileSize(size)) {
            logger.debug("File size was not ok. Skipping");
            return;
        }


        String fileFormat = getFileFormat(doc);
        if (!okFileFormat(fileFormat)) {
            logger.debug("File format " + fileFormat + " was not an allowed file format");
            return;
        }


        String url = getUrl(doc);


        try {
            stream = getStreamFromUrl(url);
        } catch (IOException e) {
            logger.warn("Failed to open stream to url: " + url, e);
            return;
        }


        try {
            enrichDocumentWithFileContents(doc, stream);
        } catch (Exception e) {
            logger.warn(
                    "The parser experienced a problem. " + "The data from the specified file will not be included.", e);
            return;
        } finally {
            try {
                stream.close();
            } catch (IOException e) {
                logger.warn("Failed to close stream. Was it never opened?");
            }
        }


    }


    boolean okFileFormat(String fileFormat) {
        if (lowerCaseAllowedFileFormatsSet == null) {
            return true;
        }
        if (fileFormat == null) {
            return false;
        }


        return lowerCaseAllowedFileFormatsSet.contains(fileFormat.toLowerCase());
    }


    String getFileFormat(LocalDocument doc) {
        Object fileFormatObject = doc.getContentField(fileFormatField);


        return fileFormatHelper(fileFormatObject, 0);
    }


    private String fileFormatHelper(Object fileFormatObject, int depth) {
        if (fileFormatObject == null || depth > 1)
            return null;


        if (fileFormatObject instanceof List<?>) {
            if (((List<?>) fileFormatObject).size() > 0) {
                return fileFormatHelper(((List<?>) fileFormatObject).get(0), depth + 1);
            } else {
                return null;
            }
        }
        if (fileFormatObject instanceof String) {
            return (String) fileFormatObject;
        }


        logger.debug("Failed to parse fileFormat");
        return null;
    }


    long getFileSize(LocalDocument doc) {
        Object fileSizeObject = doc.getContentField(fileSizeField);


        return fileSizeHelper(fileSizeObject, 0);
    }


    private long fileSizeHelper(Object fileSizeObject, int depth) {
        if (fileSizeObject == null || depth > 1)
            return -1;


        if (fileSizeObject instanceof List<?>) {
            if (((List<?>) fileSizeObject).size() > 0) {
                return fileSizeHelper(((List<?>) fileSizeObject).get(0), depth + 1);
            } else {
                return -1;
            }
        }
        if (fileSizeObject instanceof String) {
            return Long.parseLong((String) fileSizeObject);
        }
        if (fileSizeObject instanceof Number) {
            return ((Number) fileSizeObject).longValue();
        }


        logger.warn("File size could not be parsed");
        return -1;
    }


    boolean okFileSize(long size) {
        if (maxSizeInBytes < 0)
            return true;


        return (size <= maxSizeInBytes);
    }


    private void enrichDocumentWithFileContents(LocalDocument doc, InputStream stream) throws IOException,
            SAXException, TikaException {
        InputStreamParser inputStreamParser = new InputStreamParser();
        ParsedData parsedData = inputStreamParser.parse(stream);


        addTextToDocument(doc, parsedData.getContent());
        addMetadataToDocument(doc, parsedData.getMetadata());
    }


    void addTextToDocument(LocalDocument doc, String textData) {
        doc.putContentField(contentField, textData);
    }


    void addMetadataToDocument(LocalDocument doc, Map<String, Object> metadata) {
        for (String name : metadata.keySet()) {
            doc.putContentField(metadataPrefix + name, metadata.get(name));
        }
    }


    InputStream getStreamFromUrl(String stringUrl) throws IOException {
        URL url = new URL(stringUrl);
        InputStream in = url.openStream();
        return in;
    }


    @Override
    public void init() throws RequiredArgumentMissingException {
        if (metadataPrefix == null) {
            throw new RequiredArgumentMissingException("Missing required configuration: metadataPrefix");
        }
        if (contentField == null) {
            throw new RequiredArgumentMissingException("Missing required configuration: contentField");
        }
        if (urlField == null) {
            throw new RequiredArgumentMissingException("Missing required configuration: urlField");
        }
        if (fileSizeField == null && maxSizeInBytes > 0) {
            throw new RequiredArgumentMissingException(
                    "Missing required configuration: fileSizeField - FileSizeField must be set when maxSizeInBytes is set");
        }
        if (allowedFileFormats != null && fileFormatField == null) {
            throw new RequiredArgumentMissingException(
                    "Missing required configuration: fileFormatField - fileFormatField must be set when allowedFileFormats is set");
        }


        setLowerCaseAllowedFileFormats();
    }


    void setLowerCaseAllowedFileFormats() {
        if (allowedFileFormats == null || allowedFileFormats.size() <= 0)
            return;
        lowerCaseAllowedFileFormatsSet = new HashSet<String>();
        for (String allowedFileFormat : allowedFileFormats) {
            lowerCaseAllowedFileFormatsSet.add(allowedFileFormat.toLowerCase());
        }
    }


    /**
     * Fetches a url from the field specified by the urlField.
     * <p/>
     * If the field is a List<String> returns the FIRST String. The rest will be
     * ignored.
     *
     * @param doc The document to get the url from
     * @return The String in if it is a String, the first String if it is a
     * List<String> or null otherwise
     */
    String getUrl(LocalDocument doc) {
        Object urlObject = doc.getContentField(urlField);


        if (urlObject == null) {
            return null;
        }


        if (urlObject instanceof String) {
            return (String) urlObject;
        } else if (urlObject instanceof List<?>) {
            List<?> urlList = (List<?>) urlObject;
            if (!urlList.isEmpty()) {
                if (urlList.get(0) instanceof String) {
                    return (String) urlList.get(0);
                } else {
                    logger.warn("List in " + urlField + " did not contain Strings. Skipping");
                    return null;
                }
            } else {
                logger.warn("List was empty. Skipping");
            }
        } else {
            logger.warn(urlField + " did not contain String nor List. Skipping");
        }


        return null;
    }


    public void setUrlField(String urlField) {
        this.urlField = urlField;
    }


    public void setContentField(String contentField) {
        this.contentField = contentField;
    }


    public String getContentField() {
        return contentField;
    }


    public void setMetadatPrefix(String prefix) {
        this.metadataPrefix = prefix;
    }


    public String getMetadataPrefix() {
        return metadataPrefix;
    }


    public long getMaxSizeInBytes() {
        return maxSizeInBytes;
    }


    public void setMaxSizeInBytes(long size) {
        this.maxSizeInBytes = size;
    }


    public String getFileSizeField() {
        return fileSizeField;
    }


    public void setFileSizeField(String fileSizeField) {
        this.fileSizeField = fileSizeField;
    }


    public String getFileFormatField() {
        return fileFormatField;
    }


    public void setFileFormatField(String fileFormatField) {
        this.fileFormatField = fileFormatField;
    }


    public List<String> getAllowedFileFormats() {
        return allowedFileFormats;
    }


    public void setAllowedFileFormats(List<String> allowedFileFormats) throws RequiredArgumentMissingException {
        this.allowedFileFormats = allowedFileFormats;
    }
}
Source Code of com.findwise.hydra.stage.tika.TextExtractionStage

Related Classes of com.findwise.hydra.stage.tika.TextExtractionStage