Package org.apache.tika.extractor

Examples of org.apache.tika.extractor.EmbeddedDocumentExtractor


        if (embeddedFiles == null) {
            return;
        }

        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
        if (embeddedExtractor == null) {
            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
        }

        Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames();
View Full Code Here


    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws SAXException, IOException, TikaException {
        EmbeddedDocumentExtractor extractor =
            new EmbeddedDocumentExtractor(context);

        try {
            File file = TikaInputStream.get(stream).getFile();
            Archive archive = new Archive(file);

            metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
            XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            for (FileHeader header : archive.getFileHeaders()) {
                Metadata entrydata = new Metadata();
                entrydata.set(
                        Metadata.RESOURCE_NAME_KEY,
                        header.getFileNameString());
                if (extractor.shouldParseEmbedded(entrydata)) {
                    extractor.parseEmbedded(stream, xhtml, entrydata, true);
                }
            }
            xhtml.endDocument();
        } catch (RarException e) {
            throw new TikaException("Unable to parse a RAR archive", e);
View Full Code Here

abstract class AbstractPOIFSExtractor {

    private final EmbeddedDocumentExtractor extractor;

    /**
     * Creates the extractor and initialises its {@code extractor} field with a
     * new {@link EmbeddedDocumentExtractor} built from the given parse context.
     *
     * @param context the parse context handed to the embedded document extractor
     */
    protected AbstractPOIFSExtractor(ParseContext context) {
        this.extractor = new EmbeddedDocumentExtractor(context);
    }
View Full Code Here

    /**
     * Creates a package extractor that keeps the supplied handler and metadata
     * and builds a new {@link EmbeddedDocumentExtractor} from the parse context.
     *
     * @param handler  the content handler stored in the {@code handler} field
     * @param metadata the metadata object stored in the {@code metadata} field
     * @param context  the parse context handed to the embedded document extractor
     */
    public PackageExtractor(
            ContentHandler handler, Metadata metadata, ParseContext context) {
        this.handler = handler;
        this.metadata = metadata;
        this.extractor = new EmbeddedDocumentExtractor(context);
    }
View Full Code Here

    private final EmbeddedDocumentExtractor embeddedExtractor;

    public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
        this.extractor = extractor;

        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex==null) {
            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            embeddedExtractor = ex;
View Full Code Here

        if (!type.equals(MediaType.OCTET_STREAM)) {
            metadata.set(CONTENT_TYPE, type.toString());
        }

        // Use the delegate parser to parse the contained document
        EmbeddedDocumentExtractor extractor = context.get(
                EmbeddedDocumentExtractor.class,
                new ParsingEmbeddedDocumentExtractor(context));

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
View Full Code Here

                }
                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
            }

            // Use the delegate parser to parse the compressed document
            EmbeddedDocumentExtractor extractor = context.get(
                    EmbeddedDocumentExtractor.class,
                    new ParsingEmbeddedDocumentExtractor(context));
            if (extractor.shouldParseEmbedded(entrydata)) {
                extractor.parseEmbedded(cis, xhtml, entrydata, true);
            }
        } finally {
            cis.close();
        }
View Full Code Here

        if (names != null) {

            PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
            if (embeddedFiles != null) {

                EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
                if (embeddedExtractor == null) {
                    embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
                }

                Map<String,Object> embeddedFileNames = embeddedFiles.getNames();

                if (embeddedFileNames != null) {
                    for (Map.Entry<String,Object> ent : embeddedFileNames.entrySet()) {
                        PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
                        PDEmbeddedFile file = spec.getEmbeddedFile();

                        Metadata metadata = new Metadata();
                        // TODO: other metadata?
                        metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
                        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
                        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));

                        if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                            TikaInputStream stream = TikaInputStream.get(file.createInputStream());
                            try {
                                embeddedExtractor.parseEmbedded(
                                                                stream,
                                                                new EmbeddedContentHandler(handler),
                                                                metadata, false);
                            } finally {
                                stream.close();
View Full Code Here

    private TikaConfig tikaConfig;
    private MimeTypes mimeTypes;
    private Detector detector;

    protected AbstractPOIFSExtractor(ParseContext context) {
        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex==null) {
            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            this.extractor = ex;
View Full Code Here

abstract class AbstractPOIFSExtractor {

    private final EmbeddedDocumentExtractor extractor;

    protected AbstractPOIFSExtractor(ParseContext context) {
        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);

        if (ex==null) {
            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
        } else {
            this.extractor = ex;
View Full Code Here

TOP

Related Classes of org.apache.tika.extractor.EmbeddedDocumentExtractor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.