Package org.apache.tika.parser

Examples of org.apache.tika.parser.Parser


     * The From address isn't in the usual form.
     * See TIKA-618
     */
    @Test
    public void testUnusualFromAddress() throws Exception {
       Parser parser = new RFC822Parser();
       Metadata metadata = new Metadata();
       InputStream stream = getStream("test-documents/testRFC822_oddfrom");
       ContentHandler handler = mock(DefaultHandler.class);

       parser.parse(stream, handler, metadata, new ParseContext());
       assertEquals("Saved by Windows Internet Explorer 7",
               metadata.get(TikaCoreProperties.CREATOR));
       assertEquals("Air Permit Programs | Air & Radiation | US EPA",
               metadata.get(TikaCoreProperties.TITLE));
       assertEquals("Air Permit Programs | Air & Radiation | US EPA",
View Full Code Here


        assertEquals(2, content.split("<\\/body>").length);
    }

    @Test
    public void testOutlookForwarded() throws Exception {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
      
        // Check the HTML version
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                 SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
               "/test-documents/testMSG_forwarded.msg");
        try {
           parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
           stream.close();
        }
        
        // Make sure we don't have nested docs
View Full Code Here

                    "really really really really really really long name ");
        }
        String name = inputBuilder.toString();
        byte[] data = ("From: " + name + "\r\n\r\n").getBytes("US-ASCII");

        Parser parser = new RFC822Parser();
        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        try {
            parser.parse(
                    new ByteArrayInputStream(data), handler, metadata, context);
            fail();
        } catch (TikaException expected) {
        }

        MimeConfig config = new MimeConfig();
        config.setMaxHeaderLen(-1);
        config.setMaxLineLen(-1);
        context.set(MimeConfig.class, config);
        parser.parse(
                new ByteArrayInputStream(data), handler, metadata, context);
        assertEquals(name.trim(), metadata.get(TikaCoreProperties.CREATOR));
    }
View Full Code Here

        assertEquals(2, content.split("<\\/body>").length);
    }
   
    @Test
    public void testOutlookHTMLfromRTF() throws Exception {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
      
        // Check the HTML version
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                 SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
                "/test-documents/test-outlook2003.msg");
        try {
           parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
           stream.close();
        }
        
        // As the HTML version should have been processed, ensure
View Full Code Here

    /**
     * Test for TIKA-678 - not all headers may be present
     */
    @Test
    public void testSomeMissingHeaders() throws Exception {
       Parser parser = new RFC822Parser();
       Metadata metadata = new Metadata();
       InputStream stream = getStream("test-documents/testRFC822-limitedheaders");
       ContentHandler handler = new BodyContentHandler();

       parser.parse(stream, handler, metadata, new ParseContext());
       assertEquals(true, metadata.isMultiValued(TikaCoreProperties.CREATOR));
       assertEquals("xyz", metadata.getValues(TikaCoreProperties.CREATOR)[0]);
       assertEquals("abc", metadata.getValues(TikaCoreProperties.CREATOR)[1]);
       assertEquals(true, metadata.isMultiValued(Metadata.MESSAGE_FROM));
       assertEquals("xyz", metadata.getValues(Metadata.MESSAGE_FROM)[0]);
View Full Code Here

public class MboxParserTest {

    @Test
    public void testSimple() {
        Parser parser = new MboxParser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/simple.mbox");
        ContentHandler handler = mock(DefaultHandler.class);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());
            verify(handler).startDocument();
            verify(handler, times(2)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
            verify(handler, times(2)).endElement(XHTMLContentHandler.XHTML, "p", "p");
            verify(handler).characters(new String("Test content 1").toCharArray(), 0, 14);
            verify(handler).characters(new String("Test content 2").toCharArray(), 0, 14);
View Full Code Here

        }
    }

    @Test
    public void testHeaders() {
        Parser parser = new MboxParser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/headers.mbox");
        ContentHandler handler = mock(DefaultHandler.class);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());

            verify(handler).startDocument();
            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
            verify(handler).characters(new String("Test content").toCharArray(), 0, 12);
            verify(handler).endDocument();
View Full Code Here

        }
    }

    @Test
    public void testMultilineHeader() {
        Parser parser = new MboxParser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/multiline.mbox");
        ContentHandler handler = mock(DefaultHandler.class);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());

            verify(handler).startDocument();
            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
            verify(handler).characters(new String("Test content").toCharArray(), 0, 12);
            verify(handler).endDocument();
View Full Code Here

        }
    }

    @Test
    public void testQuoted() {
        Parser parser = new MboxParser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/quoted.mbox");
        ContentHandler handler = mock(DefaultHandler.class);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());

            verify(handler).startDocument();
            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
            verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"), any(Attributes.class));
            verify(handler).endElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"));
View Full Code Here

        }
    }

    @Test
    public void testComplex() {
        Parser parser = new MboxParser();
        Metadata metadata = new Metadata();
        InputStream stream = getStream("test-documents/complex.mbox");
        ContentHandler handler = mock(DefaultHandler.class);

        try {
            parser.parse(stream, handler, metadata, new ParseContext());

            // TODO: Remove subject and author in Tika 2.0
            assertEquals("Re: question about when shuffle/sort start working", metadata.get(Metadata.SUBJECT));
            assertEquals("Re: question about when shuffle/sort start working", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Jothi Padmanabhan <jothipn@yahoo-inc.com>", metadata.get(Metadata.AUTHOR));
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.Parser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.