Package org.apache.tika.io

Examples of org.apache.tika.io.TikaInputStream


        zout.put(finalName, data);

        count.increment();
      } else {
        if (inputStream instanceof TikaInputStream) {
          TikaInputStream tin = (TikaInputStream) inputStream;

          if (tin.getOpenContainer()!=null && tin.getOpenContainer() instanceof DirectoryEntry) {
            POIFSFileSystem fs = new POIFSFileSystem();
            copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
            ByteArrayOutputStream bos2 = new ByteArrayOutputStream();
            fs.writeFilesystem(bos2);
            bos2.close();

            zout.put(finalName, bos2.toByteArray());
View Full Code Here


       assertTrue(needle > -1);
       assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
      
       //plagiarized from POIContainerExtractionTest.  Thank you!
       TrackingHandler tracker = new TrackingHandler();
       TikaInputStream tis;
       ContainerExtractor ex = new ParserContainerExtractor();
       try{
          tis= TikaInputStream.get(getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"));
          ex.extract(tis, ex, tracker);
       } finally {
View Full Code Here

        String xml = getXML("/testPDF_childAttachments.pdf").xml;
        //"regressiveness" exists only in Unit10.doc not in the container pdf document
        assertTrue(xml.contains("regressiveness"));

        TrackingHandler tracker = new TrackingHandler();
        TikaInputStream tis = null;
        ContainerExtractor ex = new ParserContainerExtractor();
        try{
            tis= TikaInputStream.get(
                getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"));
            ex.extract(tis, ex, tracker);
        } finally {
            if (tis != null){
                tis.close();
            }
        }
        assertEquals(2, tracker.filenames.size());
        assertEquals(2, tracker.mediaTypes.size());
        assertEquals("Press Quality(1).joboptions", tracker.filenames.get(0));
View Full Code Here

public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
   private static final String file = "testWINMAIL.dat";
  
   @Test
   public void testBasics() throws Exception {
      TikaInputStream stream = getTestFile(file);
      Detector detector = new DefaultDetector();
      try {
         assertEquals(
                 MediaType.application("vnd.ms-tnef"),
                 detector.detect(stream, new Metadata()));
     } finally {
         stream.close();
     }
   }
View Full Code Here

     }
   }
  
   @Test
   public void testMetadata() throws Exception {
      TikaInputStream stream = getTestFile(file);
     
      Metadata metadata = new Metadata();
      ContentHandler handler = new BodyContentHandler();
     
      TNEFParser tnef = new TNEFParser();
View Full Code Here

                       break;
                    default:
                       mimeType =  "image/unknown";
                       break;
                    }
                    TikaInputStream stream = TikaInputStream.get(blip.getPicturedata());
                   
                    // Handle the embedded resource
                    extractor.handleEmbeddedResource(
                          stream, null, mimeType,
                          handler, true
View Full Code Here

       xhtml.endElement("img");

       // Have we already output this one?
       // (Only expose each individual image once)
       if(! pictures.hasOutput(picture)) {
          TikaInputStream stream = TikaInputStream.get(picture.getContent());
          handleEmbeddedResource(stream, filename, mimeType, xhtml, false);
          pictures.recordOutput(picture);
       }
    }
View Full Code Here

    protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
        InputStream input = AbstractPOIContainerExtractionTest.class.getResourceAsStream(
             "/test-documents/" + filename);
        assertNotNull(filename + " not found", input);
       
        TikaInputStream stream = TikaInputStream.get(input);
        assertNotNull(stream);
       
        assertEquals(true, extractor.isSupported(stream));
       
        // Process it
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        TrueTypeFont font;
        TikaInputStream tis = TikaInputStream.get(stream);
        TTFParser parser = new TTFParser();
        if (tis.hasFile()) {
            font = parser.parseTTF(tis.getFile());
        } else {
            font = parser.parseTTF(stream);
        }

        metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
View Full Code Here

       // Is it an embedded OLE2 document, or an embedded OOXML document?
       try {
          Entry ooxml = dir.getEntry("Package");

          // It's OOXML
          TikaInputStream ooxmlStream = TikaInputStream.get(
                new DocumentInputStream((DocumentEntry)ooxml)
          );
          ZipContainerDetector detector = new ZipContainerDetector();
          MediaType type = detector.detect(ooxmlStream, new Metadata());
          handleEmbeddedResource(ooxmlStream, null, type.toString(), xhtml, true);
          return;
       } catch(FileNotFoundException e) {
          // It's regular OLE2
       }

       // Need to dump the directory out to a new temp file, so
       //  it's standalone
       POIFSFileSystem newFS = new POIFSFileSystem();
       copy(dir, newFS.getRoot());

       File tmpFile = File.createTempFile("tika", ".ole2");
       try {
           FileOutputStream out = new FileOutputStream(tmpFile);
           newFS.writeFilesystem(out);
           out.close();

           // What kind of document is it?
           Metadata metadata = new Metadata();
           POIFSDocumentType type = POIFSDocumentType.detectType(dir);
           metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());

           // Trigger for the document itself
           TikaInputStream embedded = TikaInputStream.get(tmpFile);
           try {
               if (extractor.shouldParseEmbedded(metadata)) {
                   extractor.parseEmbedded(embedded, xhtml, metadata, true);
               }
           } finally {
               embedded.close();
           }
       } finally {
           tmpFile.delete();
       }
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.io.TikaInputStream

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.