Package org.apache.tika.extractor

Examples of org.apache.tika.extractor.ParserContainerExtractor$RecursiveParser


  
    /**
     * For office files which don't have anything embedded in them
     */
    public void testWithoutEmbedded() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
      
       String[] files = new String[] {
             "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
             "testVISIO.vsd", "test-outlook.msg"
       };
View Full Code Here


    /**
     * Office files with embedded images, but no other
     *  office files in them
     */
    public void testEmbeddedImages() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
       TrackingHandler handler;
      
       // Excel with 1 image
       handler = process("testEXCEL_1img.xls", extractor, false);
       assertEquals(1, handler.filenames.size());
View Full Code Here

     *       -> powerpoint
     *       -> excel
     *           -> image
     */
    public void testEmbeddedOfficeFiles() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
       TrackingHandler handler;
      
      
       // Excel with a word doc and a powerpoint doc, both of which have images in them
       // Without recursion, should see both documents + the images
View Full Code Here

    @Override
    protected void setUp() throws Exception {
       ContainerAwareDetector detector = new ContainerAwareDetector(
             (new TikaConfig()).getMimeRepository()
       );
       extractor = new ParserContainerExtractor(
             new AutoDetectParser(detector), detector
       );
    }
View Full Code Here

    /**
     * Check the Rtf and Attachments are returned
     *  as expected
     */
    public void testBodyAndAttachments() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
      
       // Process it with recursing
       // Will have the message body RTF and the attachments
       TrackingHandler handler = process(file, extractor, true);
       assertEquals(6, handler.filenames.size());
View Full Code Here

  
    /**
     * For office files which don't have anything embedded in them
     */
    public void testWithoutEmbedded() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
      
       String[] files = new String[] {
             "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
             "testVISIO.vsd", "test-outlook.msg"
       };
View Full Code Here

    /**
     * Office files with embedded images, but no other
     *  office files in them
     */
    public void testEmbeddedImages() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
       TrackingHandler handler;
      
       // Excel with 1 image
       handler = process("testEXCEL_1img.xls", extractor, false);
       assertEquals(1, handler.filenames.size());
View Full Code Here

     *       -> powerpoint
     *       -> excel
     *           -> image
     */
    public void testEmbeddedOfficeFiles() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
       TrackingHandler handler;
      
      
       // Excel with a word doc and a powerpoint doc, both of which have images in them
       // Without recursion, should see both documents + the images
View Full Code Here

       assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
       assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
    }

    public void testEmbeddedOfficeFilesXML() throws Exception {
        ContainerExtractor extractor = new ParserContainerExtractor();
        TrackingHandler handler;

        handler = process("EmbeddedDocument.docx", extractor, false);
        assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
        assertEquals(2, handler.filenames.size());
View Full Code Here

    /**
     * For office files which don't have anything embedded in them
     */
    @Test
    public void testWithoutEmbedded() throws Exception {
       ContainerExtractor extractor = new ParserContainerExtractor();
      
       String[] files = new String[] {
             "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
             "testVISIO.vsd", "test-outlook.msg"
       };
View Full Code Here

TOP

Related Classes of org.apache.tika.extractor.ParserContainerExtractor$RecursiveParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.