Package org.apache.tika.parser

Examples of org.apache.tika.parser.RecursiveParserWrapper


   
    //TIKA-1010 test regular (not "embedded") images/picts
    public void testRegularImages() throws Exception {
        Parser base = new AutoDetectParser();
        ParseContext ctx = new ParseContext();
        RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        ctx.set(org.apache.tika.parser.Parser.class, parser);
        TikaInputStream tis = null;
        ContentHandler handler = new BodyContentHandler();
        Metadata rootMetadata = new Metadata();
        rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
        try {
            tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"));
            parser.parse(tis, handler, rootMetadata, ctx);           
        } finally {
            tis.close();
        }
        List<Metadata> metadatas =  parser.getMetadata();

        Metadata meta_jpg_exif = metadatas.get(0);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
        Metadata meta_jpg = metadatas.get(2);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
       
        assertTrue(meta_jpg_exif != null);
View Full Code Here


    }

    private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
        Metadata metadata = new Metadata();
        InputStream input = TikaInputStream.get(url, metadata);
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, getContentHandlerFactory(type));
        try {
            wrapper.parse(input, null, metadata, context);
        } finally {
            input.close();
        }
        JsonMetadataList.setPrettyPrinting(prettyPrint);
        Writer writer = getOutputWriter(output, encoding);
        try {
            JsonMetadataList.toJson(wrapper.getMetadata(), writer);
        } finally {
            writer.flush();
        }
    }
View Full Code Here

                    "There's a limit of "+MAX_MARK + " bytes for this type of processing in the GUI.\n"+
                    "Try the app with command line argument of -J."
            );
        }
        if (isReset) {
            RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
                    new BasicContentHandlerFactory(
                            BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
            wrapper.parse(input, null, new Metadata(), new ParseContext());
            StringWriter jsonBuffer = new StringWriter();
            JsonMetadataList.setPrettyPrinting(true);
            JsonMetadataList.toJson(wrapper.getMetadata(), jsonBuffer);
            setText(json, jsonBuffer.toString());
        }
        layout.show(cards, "metadata");
    }
View Full Code Here

    public void testEmbeddedFilesInChildren() throws Exception {
        String xml = getXML("/testPDF_childAttachments.pdf").xml;
        //"regressiveness" exists only in Unit10.doc not in the container pdf document
        assertTrue(xml.contains("regressiveness"));

        RecursiveParserWrapper p = new RecursiveParserWrapper(new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        TikaInputStream tis = null;
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        config.setExtractInlineImages(true);
        config.setExtractUniqueInlineImagesOnly(false);
        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
        context.set(org.apache.tika.parser.Parser.class, p);

        try {
            tis= TikaInputStream.get(
                    getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"));
            p.parse(tis, new BodyContentHandler(-1), new Metadata(), context);
        } finally {
            if (tis != null) {
                tis.close();
            }
        }

        List<Metadata> metadatas = p.getMetadata();

        assertEquals(5, metadatas.size());
        assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
        assertEquals("image0.jpg", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
        assertEquals("Press Quality(1).joboptions", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
View Full Code Here

        config.setExtractInlineImages(true);
        config.setExtractUniqueInlineImagesOnly(false);

        Parser defaultParser = new AutoDetectParser();

        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        ParseContext context = new ParseContext();
        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
        context.set(org.apache.tika.parser.Parser.class, p);
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler(-1);
        String path = "/test-documents/testPDF_childAttachments.pdf";
        InputStream stream = TikaInputStream.get(this.getClass().getResource(path));

        p.parse(stream, handler, metadata, context);

        List<Metadata> metadatas = p.getMetadata();
        int inline = 0;
        int attach = 0;
        for (Metadata m : metadatas) {
            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
            if (v != null) {
                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){
                    inline++;
                } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){
                    attach++;
                }
            }
        }
        assertEquals(2, inline);
        assertEquals(2, attach);

        stream.close();
        p.reset();

        //now try turning off inline
        stream = TikaInputStream.get(this.getClass().getResource(path));

        context.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector());
        inline = 0;
        attach = 0;
        handler = new BodyContentHandler(-1);
        metadata = new Metadata();
        p.parse(stream, handler, metadata, context);

        metadatas = p.getMetadata();
        for (Metadata m : metadatas) {
            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
            if (v != null) {
                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){
                    inline++;
View Full Code Here

    @Test
    public void testInlineConfig() throws Exception {
       
        Parser defaultParser = new AutoDetectParser();
        RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        ParseContext context = new ParseContext();
        context.set(org.apache.tika.parser.Parser.class, p);
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler(-1);
        String path = "/test-documents/testPDF_childAttachments.pdf";
        InputStream stream = TikaInputStream.get(this.getClass().getResource(path));

        p.parse(stream, handler, metadata, context);

        List<Metadata> metadatas = p.getMetadata();
        int inline = 0;
        int attach = 0;
        for (Metadata m : metadatas) {
            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
            if (v != null) {
                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){
                    inline++;
                } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString())){
                    attach++;
                }
            }
        }
        assertEquals(0, inline);
        assertEquals(2, attach);

        stream.close();
        p.reset();

        //now try turning off inline
        stream = TikaInputStream.get(this.getClass().getResource(path));
        PDFParserConfig config = new PDFParserConfig();
        config.setExtractInlineImages(true);
        config.setExtractUniqueInlineImagesOnly(false);

        context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
        inline = 0;
        attach = 0;
        handler = new BodyContentHandler(-1);
        metadata = new Metadata();
        p.parse(stream, handler, metadata, context);

        metadatas = p.getMetadata();
        for (Metadata m : metadatas) {
            String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
            if (v != null) {
                if (v.equals(TikaCoreProperties.EmbeddedResourceType.INLINE.toString())){
                    inline++;
View Full Code Here

    @Test //TIKA-1376
    public void testEmbeddedFileNameExtraction() throws Exception {
        InputStream is = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF_multiFormatEmbFiles.pdf");
        RecursiveParserWrapper p = new RecursiveParserWrapper(
                new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        Metadata m = new Metadata();
        ParseContext c = new ParseContext();
        c.set(org.apache.tika.parser.Parser.class, p);
        ContentHandler h = new BodyContentHandler();
        p.parse(is, h, m, c);
        is.close();
        List<Metadata> metadatas = p.getMetadata();
        assertEquals("metadata size", 5, metadatas.size());
        Metadata firstAttachment = metadatas.get(1);
        assertEquals("attachment file name", "Test.txt", firstAttachment.get(Metadata.RESOURCE_NAME_KEY));
    }
View Full Code Here

    @Test //TIKA-1374
    public void testOSSpecificEmbeddedFileExtraction() throws Exception {
        InputStream is = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF_multiFormatEmbFiles.pdf");
        RecursiveParserWrapper p = new RecursiveParserWrapper(
                new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        Metadata m = new Metadata();
        ParseContext c = new ParseContext();
        c.set(org.apache.tika.parser.Parser.class, p);
        ContentHandler h = new BodyContentHandler();
        p.parse(is, h, m, c);
        is.close();
        List<Metadata> metadatas = p.getMetadata();
        assertEquals("metadata size", 5, metadatas.size());

        assertEquals("file name", "Test.txt", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
        assertContains("os specific", metadatas.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertEquals("file name", "TestMac.txt", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY));
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.RecursiveParserWrapper

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.