Package org.apache.tika.io

Examples of org.apache.tika.io.TikaInputStream


       xhtml.endElement("img");

       // Have we already output this one?
       // (Only expose each individual image once)
       if(! pictures.hasOutput(picture)) {
          TikaInputStream stream = TikaInputStream.get(picture.getContent());
          handleEmbeddedResource(stream, filename, mimeType, xhtml, false);
          pictures.recordOutput(picture);
       }
    }
View Full Code Here


    /**
     * Detector instance exercised by the assertions in this snippet: a
     * ContainerAwareDetector constructed from MimeTypes.getDefaultMimeTypes().
     */
    private final Detector detector =
        new ContainerAwareDetector(MimeTypes.getDefaultMimeTypes());

    private void assertDetect(String file, String type) throws Exception {
        TikaInputStream stream = TikaInputStream.get(
                TestContainerAwareDetector.class.getResource(
                        "/test-documents/" + file));
        try {
            assertEquals(
                    MediaType.parse(type),
                    detector.detect(stream, new Metadata()));
        } finally {
            stream.close();
        }
    }
View Full Code Here

        assertDetect("testQUATTRO.qpw", "application/x-quattro-pro");
        assertDetect("testQUATTRO.wb3", "application/x-quattro-pro");
    }

    public void testOpenContainer() throws Exception {
        TikaInputStream stream = TikaInputStream.get(
                TestContainerAwareDetector.class.getResource(
                        "/test-documents/testPPT.ppt"));
        try {
            assertNull(stream.getOpenContainer());
            assertEquals(
                    MediaType.parse("application/vnd.ms-powerpoint"),
                    detector.detect(stream, new Metadata()));
            assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
        } finally {
            stream.close();
        }
    }
View Full Code Here

    }

    private void assertRemovalTempfiles(String fileName) throws Exception {
        int numberOfTempFiles = countTemporaryFiles();

        TikaInputStream stream = TikaInputStream.get(
                TestContainerAwareDetector.class.getResource(
                        "/test-documents/" + fileName));
        try {
            detector.detect(stream, new Metadata());
        } finally {
            stream.close();
        }

        assertEquals(numberOfTempFiles, countTemporaryFiles());
    }
View Full Code Here

        }
    }

    public void testTruncatedFiles() throws Exception {
        // First up a truncated OOXML (zip) file
        TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300);
        try {
            assertEquals(
                    MediaType.application("x-tika-ooxml"),
                    detector.detect(xlsx, new Metadata()));
        } finally {
            xlsx.close();
        }

        // Now a truncated OLE2 file
        TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400);
        try {
            assertEquals(
                    MediaType.application("x-tika-msoffice"),
                    detector.detect(xls, new Metadata()));
        } finally {
            xls.close();
        }
   }
View Full Code Here

            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        TemporaryResources tmp = new TemporaryResources();
        try {
            TikaInputStream tis = TikaInputStream.get(stream, tmp);

            // Automatically detect the MIME type of the document
            MediaType type = detector.detect(tis, metadata);
            metadata.set(Metadata.CONTENT_TYPE, type.toString());
View Full Code Here

       
        try {
            OOXMLExtractor extractor;

            POIXMLTextExtractor poiExtractor;
            TikaInputStream tis = TikaInputStream.cast(stream);
            if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
                poiExtractor = ExtractorFactory.createExtractor(
                        (OPCPackage) tis.getOpenContainer());
            } else if (tis != null && tis.hasFile()) {
                poiExtractor = (POIXMLTextExtractor)
                        ExtractorFactory.createExtractor(tis.getFile());
            } else {
                InputStream shield = new CloseShieldInputStream(stream);
                poiExtractor = (POIXMLTextExtractor)
                        ExtractorFactory.createExtractor(shield);
            }
View Full Code Here

            System.out.println("Extracting '"+name+"' ("+contentType+")");

            FileOutputStream os = new FileOutputStream(outputFile);

            if (inputStream instanceof TikaInputStream) {
                TikaInputStream tin = (TikaInputStream) inputStream;

                if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
                    POIFSFileSystem fs = new POIFSFileSystem();
                    copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
                    fs.writeFilesystem(os);
                } else {
                    IOUtils.copy(inputStream, os);
                }
            } else {
View Full Code Here

        //assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE)); // TODO Extract
        assertEquals("M4A", metadata.get(XMPDM.AUDIO_COMPRESSOR));
       
       
        // Check again by file, rather than stream
        TikaInputStream tstream = TikaInputStream.get(
              MP4ParserTest.class.getResourceAsStream("/test-documents/testMP4.m4a"));
        tstream.getFile();
        try {
           parser.parse(tstream, handler, metadata, new ParseContext());
        } finally {
           tstream.close();
        }
    }
View Full Code Here

        if (dir.hasEntry("Package")) {
            // It's OOXML (has a ZipFile):
            Entry ooxml = dir.getEntry("Package");

            TikaInputStream stream = TikaInputStream.get(
                    new DocumentInputStream((DocumentEntry) ooxml));
            try {
                ZipContainerDetector detector = new ZipContainerDetector();
                MediaType type = detector.detect(stream, new Metadata());
                handleEmbeddedResource(stream, null, type.toString(), xhtml, true);
                return;
            } finally {
                stream.close();
            }
        }

        // It's regular OLE2:

        // What kind of document is it?
        Metadata metadata = new Metadata();
        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
        TikaInputStream embedded = null;

        try {
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)dir);
                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                   
                    byte[] data = ole.getDataBuffer();
                    embedded = TikaInputStream.get(data);
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                try {
                   // Grab the contents and process
                   DocumentEntry contentsEntry = (DocumentEntry)dir.getEntry("CONTENTS");
                   DocumentInputStream inp = new DocumentInputStream(contentsEntry);
                   byte[] contents = new byte[contentsEntry.getSize()];
                   inp.readFully(contents);
                   embedded = TikaInputStream.get(contents);
                  
                   // Try to work out what it is
                   MediaType mediaType = getDetector().detect(embedded, new Metadata());
                   String extension = type.getExtension();
                   try {
                      MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                      extension = mimeType.getExtension();
                   } catch(MimeTypeException mte) {
                      // No details on this type are known
                   }
                  
                   // Record what we can do about it
                   metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                   metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
                } catch(Exception e) {
                   throw new TikaException("Invalid embedded resource", e);
                }
            } else {
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
            }

            // Should we parse it?
            if (extractor.shouldParseEmbedded(metadata)) {
                if (embedded == null) {
                    // Make a TikaInputStream that just
                    // passes the root directory of the
                    // embedded document, and is otherwise
                    // empty (byte[0]):
                    embedded = TikaInputStream.get(new byte[0]);
                    embedded.setOpenContainer(dir);
                }
                extractor.parseEmbedded(embedded, xhtml, metadata, true);
            }
        } finally {
            if (embedded != null) {
                embedded.close();
            }
        }
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.io.TikaInputStream

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.