Package org.apache.tika.sax

Examples of org.apache.tika.sax.BodyContentHandler


    }

    @Test
    public void testExcel() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);

        InputStream input = getTestDocument("testEXCEL.xlsx");
        try {
            parser.parse(input, handler, metadata, context);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
            String content = handler.toString();
            assertTrue(content.contains("Sample Excel Worksheet"));
            assertTrue(content.contains("Numbers and their Squares"));
            assertTrue(content.contains("9"));
            assertFalse(content.contains("9.0"));
            assertTrue(content.contains("196"));
View Full Code Here


    }

    @Test
    public void testExcelFormats() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        context.set(Locale.class, Locale.US);

        InputStream input = getTestDocument("testEXCEL-formats.xlsx");
        try {
            parser.parse(input, handler, metadata, context);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));

            String content = handler.toString();

            // Number #,##0.00
            assertTrue(content.contains("1,599.99"));
            assertTrue(content.contains("-1,599.99"));
View Full Code Here

            Parser parser = new AutoDetectParser();
            Metadata metadata = new Metadata();
            // TODO: should auto-detect without the resource name
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
            ContentHandler handler = new BodyContentHandler();
            ParseContext context = new ParseContext();
   
            InputStream input = getTestDocument(filename);
            try {
                parser.parse(input, handler, metadata, context);
   
                assertEquals(
                        "Mime-type checking for " + filename,
                        mimeTypes[i],
                        metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
                assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
                assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
               
                String content = handler.toString();
                // Theme files don't have the text in them
                if(extension.equals("thmx")) {
                    assertEquals("", content);
                } else {
                    assertTrue(
View Full Code Here

            // TODO: should auto-detect without the resource name
            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);

      // Allow the value to be access from the inner class
      final int currentI = i;
            ContentHandler handler = new BodyContentHandler()
    {
        public void startDocument ()
        {
      assertEquals(
             "Mime-type checking for " + filename,
View Full Code Here

public class TarParserTest extends AbstractPkgTest {

    @Test
    public void testTarParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = TarParserTest.class.getResourceAsStream(
                "/test-documents/test-documents.tar");
        try {
            parser.parse(stream, handler, metadata, recursingContext);
        } finally {
            stream.close();
        }

        assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
        String content = handler.toString();
        assertTrue(content.contains("test-documents/testEXCEL.xls"));
        assertTrue(content.contains("Sample Excel Worksheet"));
        assertTrue(content.contains("test-documents/testHTML.html"));
        assertTrue(content.contains("Test Indexation Html"));
        assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
View Full Code Here

          String filename = "testPPT." + extension;

          Parser parser = new AutoDetectParser();
          Metadata metadata = new Metadata();
          metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
          ContentHandler handler = new BodyContentHandler();
          ParseContext context = new ParseContext();
 
          InputStream input = getTestDocument(filename);
          try {
              parser.parse(input, handler, metadata, context);
View Full Code Here

     *  fired for all the embedded entries.
     */
    @Test
    public void testEmbedded() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = ZipParserTest.class.getResourceAsStream(
               "/test-documents/test-documents.tar");
       try {
View Full Code Here

     * @throws Exception
     */
    @Test
    public void testWord() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();

        InputStream input = getTestDocument("testWORD.docx");
        try {
            parser.parse(input, handler, metadata, context);
            assertEquals(
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
            assertTrue(handler.toString().contains("Sample Word Document"));
        } finally {
            input.close();
        }
    }
View Full Code Here

     * @throws Exception
     */
    @Test
    public void testWordFootnote() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();

        InputStream input = getTestDocument("footnotes.docx");
        try {
            parser.parse(input, handler, metadata, context);
            assertEquals(
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertTrue(handler.toString().contains("snoska"));
        } finally {
            input.close();
        }
    }
View Full Code Here

        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/protectedSheets.xlsx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();

        try {
            parser.parse(input, handler, metadata, context);
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.BodyContentHandler

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.