Package org.apache.tika.sax

Examples of org.apache.tika.sax.BodyContentHandler


       InputStream input = OOXMLParserTest.class.getResourceAsStream(
             "/test-documents/testWORD_custom_props.docx");
       Metadata metadata = new Metadata();

       try {
          ContentHandler handler = new BodyContentHandler(-1);
          ParseContext context = new ParseContext();
          context.set(Locale.class, Locale.US);
          new OOXMLParser().parse(input, handler, metadata, context);
       } finally {
          input.close();
View Full Code Here


       InputStream input = OOXMLParserTest.class.getResourceAsStream(
             "/test-documents/testPPT_custom_props.pptx");
       Metadata metadata = new Metadata();

       try {
          ContentHandler handler = new BodyContentHandler(-1);
          ParseContext context = new ParseContext();
          context.set(Locale.class, Locale.US);
          new OOXMLParser().parse(input, handler, metadata, context);
       } finally {
          input.close();
View Full Code Here

        }
    }

    @Test
    public void testVarious() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = WordParserTest.class.getResourceAsStream(
                "/test-documents/testWORD_various.doc");
        try {
            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        String content = handler.toString();
        //content = content.replaceAll("\\s+"," ");
        assertContains("Footnote appears here", content);
        assertContains("This is a footnote.", content);
        assertContains("This is the header text.", content);
        assertContains("This is the footer text.", content);
View Full Code Here

     * TIKA-1044 - Handle word documents where parts of the
     *  text have no formatting or styles applied to them
     */
    @Test
    public void testNoFormat() throws Exception {
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = WordParserTest.class.getResourceAsStream(
               "/test-documents/testWORD_no_format.docx");
       try {
          new OOXMLParser().parse(stream, handler, metadata, new ParseContext());
       } finally {
           stream.close();
       }

       String content = handler.toString();
       assertContains("This is a piece of text that causes an exception", content);
    }
View Full Code Here

public class Bzip2ParserTest extends AbstractPkgTest {

    @Test
    public void testBzip2Parsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
                "/test-documents/test-documents.tbz2");
        try {
            parser.parse(stream, handler, metadata, recursingContext);
        } finally {
            stream.close();
        }

        assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
        String content = handler.toString();
        assertTrue(content.contains("test-documents/testEXCEL.xls"));
        assertTrue(content.contains("Sample Excel Worksheet"));
        assertTrue(content.contains("test-documents/testHTML.html"));
        assertTrue(content.contains("Test Indexation Html"));
        assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
View Full Code Here

     * TIKA-1044 - Handle documents where parts of the
     *  text have no formatting or styles applied to them
     */
    @Test
    public void testNoFormat() throws Exception {
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = WordParserTest.class.getResourceAsStream(
               "/test-documents/testWORD_no_format.doc");
       try {
           new OfficeParser().parse(stream, handler, metadata, new ParseContext());
       } finally {
           stream.close();
       }

       String content = handler.toString();
       assertContains("Will generate an exception", content);
    }
View Full Code Here

     * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>.
     */
    @Test
    public void testMissingText() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();

        InputStream input = getTestDocument("testWORD_missing_text.docx");
        try {
            parser.parse(input, handler, metadata, context);
            assertEquals(
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertTrue(handler.toString().contains("BigCompany"));
            assertTrue(handler.toString().contains("Seasoned"));
        } finally {
            input.close();
        }
    }
View Full Code Here

     *  fired for all the embedded entries.
     */
    @Test
    public void testEmbedded() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();

       InputStream stream = ZipParserTest.class.getResourceAsStream(
               "/test-documents/test-documents.tbz2");
       try {
View Full Code Here

       InputStream input = WordParserTest.class.getResourceAsStream(
             "/test-documents/testWORD_custom_props.doc");
       Metadata metadata = new Metadata();
      
       try {
          ContentHandler handler = new BodyContentHandler(-1);
          ParseContext context = new ParseContext();
          context.set(Locale.class, Locale.US);
          new OfficeParser().parse(input, handler, metadata, context);
       } finally {
          input.close();
View Full Code Here

    //TIKA-1100:
    @Test
    public void testExcelTextBox() throws Exception {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        InputStream input = getTestDocument("testEXCEL_textbox.xlsx");
        parser.parse(input, handler, metadata, context);
        String content = handler.toString();
        assertContains("some autoshape", content);   
    }   
View Full Code Here

TOP

Related Classes of org.apache.tika.sax.BodyContentHandler

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.