Package org.apache.tika.parser

Examples of org.apache.tika.parser.Parser


*/
public class OutlookParserTest {

    @Test
    public void testOutlookParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
                "/test-documents/test-outlook.msg");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(
View Full Code Here


public class RFC822ParserTest {

    @Test
    public void testSimple() {
        Parser parser = new RFC822Parser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/testRFC822");
        ContentHandler handler = mock(DefaultHandler.class);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
            verify(handler).startDocument();
            //just one body
            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
            verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
            //no multi-part body parts
View Full Code Here

        }
    }

    @Test
    public void testMultipart() {
        Parser parser = new RFC822Parser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/testRFC822-multipart");
        ContentHandler handler = mock(XHTMLContentHandler.class);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
            verify(handler).startDocument();
            //4 body-part divs -- two outer bodies and two inner bodies
            verify(handler, times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
            verify(handler, times(4)).endElement(XHTMLContentHandler.XHTML, "div", "div");
            //5 paragraph elements, 4 for body-parts and 1 for encompassing message
            verify(handler, times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
            verify(handler, times(5)).endElement(XHTMLContentHandler.XHTML, "p", "p");
            verify(handler).endDocument();
        } catch (Exception e) {
            fail("Exception thrown: " + e.getMessage());
        }
       
        //repeat, this time looking at content
        parser = new RFC822Parser();
        metadata = new Metadata();
        stream = getStream("test-documents/testRFC822-multipart");
        handler = new BodyContentHandler();
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
            //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
            String bodyText = handler.toString();
            assertTrue(bodyText.contains("body 1"));
            assertTrue(bodyText.contains("body 2"));
            assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
View Full Code Here

     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
     */
    @Test
    public void testMultipleCopies() throws Exception {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
                "/test-documents/testMSG.msg");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(
View Full Code Here

     *  properly reported
     */
    @Test
    public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
        BrokenParser brokenParser = new BrokenParser();
        Parser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
        InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt");
       
        // With a serializable error, we'll get that back
        try {
            ContentHandler output = new BodyContentHandler();
            ParseContext context = new ParseContext();
            parser.parse(stream, output, new Metadata(), context);
            fail("Expected TikaException caused by Error");
        } catch (TikaException e) {
            assertEquals(brokenParser.err, e.getCause());
        }
       
View Full Code Here

     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
     */
    @Test
    public void testOutlookNew() throws Exception {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
                "/test-documents/test-outlook2003.msg");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(
View Full Code Here

        }
    }

    @Test
    public void testQuotedPrintable() {
        Parser parser = new RFC822Parser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/testRFC822_quoted");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
            //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
            String bodyText = handler.toString();
            assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
            assertTrue(bodyText.contains("Lines can be split like this."));
            assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
View Full Code Here

        }
    }

    @Test
    public void testBase64() {
        Parser parser = new RFC822Parser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/testRFC822_base64");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
            //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
            assertTrue(handler.toString().contains("Here is some text, with international characters, voil\u00E0!"));
        } catch (Exception e) {
            fail("Exception thrown: " + e.getMessage());
        }
View Full Code Here

        assertTrue(content.contains("Navigation Pane"));
    }
    
    @Test
    public void testOutlookHTMLVersion() throws Exception {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
      
        // Check the HTML version
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                 SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
               "/test-documents/testMSG_chinese.msg");
        try {
           parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
           stream.close();
        }
        
        // As the HTML version should have been processed, ensure
View Full Code Here

        }
    }

    @Test
    public void testI18NHeaders() {
        Parser parser = new RFC822Parser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/testRFC822_i18nheaders");
        ContentHandler handler = mock(DefaultHandler.class);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
            //tests correct decoding of internationalized headers, both
            //quoted-printable (Q) and Base64 (B).
            assertEquals("Keld J\u00F8rn Simonsen <keld@dkuug.dk>",
                    metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("If you can read this you understand the example.",
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.Parser

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.