Package org.apache.tika.extractor

Examples of org.apache.tika.extractor.EmbeddedDocumentExtractor


        try {
            writeParagraphEnd();

            extractImages(page.getResources());

            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
            for (PDAnnotation annotation : page.getAnnotations()) {

                if (annotation instanceof PDAnnotationFileAttachment){
                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
View Full Code Here


                }

                metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                        TikaCoreProperties.EmbeddedResourceType.INLINE.toString());

                EmbeddedDocumentExtractor extractor =
                        getEmbeddedDocumentExtractor();
                if (extractor.shouldParseEmbedded(metadata)) {
                    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                    try {
                        image.write2OutputStream(buffer);
                        image.clear();
                        extractor.parseEmbedded(
                                new ByteArrayInputStream(buffer.toByteArray()),
                                new EmbeddedContentHandler(handler),
                                metadata, false);
                    } catch (IOException e) {
                        // could not extract this image, so just skip it...
View Full Code Here

        }
        resources.clear();
    }

    protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
        EmbeddedDocumentExtractor extractor =
                context.get(EmbeddedDocumentExtractor.class);
        if (extractor == null) {
            extractor = new ParsingEmbeddedDocumentExtractor(context);
        }
        return extractor;
View Full Code Here

            throws IOException, SAXException, TikaException {
        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
            return;
        }

        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
        for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) {
            PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
            extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
        }
    }
View Full Code Here

    private MimeTypes mimeTypes;
    private Detector detector;
    private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);

    protected AbstractPOIFSExtractor(ParseContext context) {
        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex==null) {
            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            this.extractor = ex;
View Full Code Here

TOP

Related Classes of org.apache.tika.extractor.EmbeddedDocumentExtractor

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.