Package org.apache.tika.sax

Examples of org.apache.tika.sax.BodyContentHandler


    public void testZipBombPrevention() throws Exception {
        InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
                "/test-documents/TIKA-216.tgz");
        try {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler(-1);
            new AutoDetectParser(tika).parse(tgz, handler, metadata);
            fail("Zip bomb was not detected");
        } catch (TikaException e) {
            // expected
        } finally {
View Full Code Here


             fail("Could not find test file " + file);
          }
         
          try {
             Metadata metadata = new Metadata();
             ContentHandler handler = new BodyContentHandler();
             new AutoDetectParser(tika).parse(input, handler, metadata);

             assertEquals("Incorrect content type for " + file,
                   mimetypes[i], metadata.get(Metadata.CONTENT_TYPE));

             // Check some of the common metadata
             assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
             assertEquals("Test Title", metadata.get(Metadata.TITLE));
//             assertEquals("Test Artist", metadata.get(TikaCoreProperties.AUTHOR));
//             assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
            
             // Check some of the XMPDM metadata
             assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
             assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
             assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
             assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
            
             // Check some of the text
             String content = handler.toString();
             assertTrue(content.contains("Test Title"));
             assertTrue(content.contains("Test Artist"));
          } finally {
             input.close();
          }
View Full Code Here

    public void testSpecificParserList() throws Exception {
        AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser());
       
        InputStream is = new ByteArrayInputStream("test".getBytes());
        Metadata metadata = new Metadata();
        parser.parse(is, new BodyContentHandler(), metadata, new ParseContext());
       
        assertEquals("value", metadata.get("MyParser"));
    }
View Full Code Here

public class Pkcs7ParserTest extends TikaTest {
    public void testDetachedSignature() throws Exception {
        InputStream input = Pkcs7ParserTest.class.getResourceAsStream(
                "/test-documents/testDetached.p7s");
        try {
            ContentHandler handler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            new Pkcs7Parser().parse(input, handler, metadata, new ParseContext());
        } catch (NullPointerException npe) {
            fail("should not get NPE");
        } catch (TikaException te) {
View Full Code Here

public class AdobeFontMetricParserTest {
 
    @Test
    public void testAdobeFontMetricParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        TikaInputStream stream = TikaInputStream.get(
                AdobeFontMetricParserTest.class.getResource(
                        "/test-documents/testAFM.afm"));

        try {
            parser.parse(stream, handler, metadata, context);
        } finally {
            stream.close();
        }

        assertEquals("application/x-font-adobe-metric", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("TestFullName", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Fri Jul 15 17:50:51 2011", metadata.get(Metadata.CREATION_DATE));
       
        assertEquals("TestFontName", metadata.get("FontName"));
        assertEquals("TestFullName", metadata.get("FontFullName"));
        assertEquals("TestSymbol",   metadata.get("FontFamilyName"));
       
        assertEquals("Medium",  metadata.get("FontWeight"));
        assertEquals("001.008", metadata.get("FontVersion"));

        String content = handler.toString();

        // Test that the comments got extracted
        assertTrue(content.contains("Comments"));
        assertTrue(content.contains("This is a comment in a sample file"));
        assertTrue(content.contains("UniqueID 12345"));
View Full Code Here

       InputStream input = PowerPointParserTest.class.getResourceAsStream(
             "/test-documents/testPPT_custom_props.ppt");
       Metadata metadata = new Metadata();
      
       try {
          ContentHandler handler = new BodyContentHandler(-1);
          ParseContext context = new ParseContext();
          context.set(Locale.class, Locale.US);
          new OfficeParser().parse(input, handler, metadata, context);
       } finally {
          input.close();
View Full Code Here

    @Test
    public void testParseAscii() throws Exception {
        String path = "/test-documents/testHTML.html";
        final StringWriter href = new StringWriter();
        final StringWriter name = new StringWriter();
        ContentHandler body = new BodyContentHandler();
        Metadata metadata = new Metadata();
        InputStream stream = HtmlParserTest.class.getResourceAsStream(path);
        try {
            ContentHandler link = new DefaultHandler() {
                @Override
                public void startElement(
                        String u, String l, String n, Attributes a)
                        throws SAXException {
                    if ("a".equals(l)) {
                        if (a.getValue("href") != null) {
                            href.append(a.getValue("href"));
                        } else if (a.getValue("name") != null) {
                            name.append(a.getValue("name"));
                        }
                    }
                }
            };
            new HtmlParser().parse(
                    stream, new TeeContentHandler(body, link),
                    metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(
                "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Tika Developers", metadata.get("Author"));
        assertEquals("5", metadata.get("refresh"));

        assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
        assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));

        assertEquals("http://www.apache.org/", href.toString());
        assertEquals("test-anchor", name.toString());

        String content = body.toString();
        assertTrue(
                "Did not contain expected text:" + "Test Indexation Html",
                content.contains("Test Indexation Html"));
        assertTrue(
                "Did not contain expected text:" + "Indexation du fichier",
View Full Code Here

        assertTrue(content.contains("an XHTML document"));
    }

    @Test
    public void testParseEmpty() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        new HtmlParser().parse(
                new ByteArrayInputStream(new byte[0]),
                handler,  new Metadata(), new ParseContext());
        assertEquals("", handler.toString());
    }
View Full Code Here

            + "<title>the name is \u00e1ndre</title>"
            + "</head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse (
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                new BodyContentHandler(),  metadata, new ParseContext());
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

                + "<title>the name is \u00e1ndre</title>"
                + "</head><body></body></html>";
        Metadata metadata = new Metadata();
        new HtmlParser().parse(
                new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
                new BodyContentHandler(), metadata, new ParseContext());
        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING));
    }
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.BodyContentHandler

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.