Package org.apache.tika.parser

Examples of org.apache.tika.parser.AutoDetectParser


*/
public class OutlookParserTest {

    @Test
    public void testOutlookParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
                "/test-documents/test-outlook.msg");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(
View Full Code Here


     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
     */
    @Test
    public void testMultipleCopies() throws Exception {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
                "/test-documents/testMSG.msg");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(
View Full Code Here

     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
     */
    @Test
    public void testOutlookNew() throws Exception {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
                "/test-documents/test-outlook2003.msg");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(
View Full Code Here

        assertTrue(content.contains("Navigation Pane"));
    }
    
    @Test
    public void testOutlookHTMLVersion() throws Exception {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
      
        // Check the HTML version
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                 SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
               "/test-documents/testMSG_chinese.msg");
        try {
           parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
           stream.close();
        }
        
        // As the HTML version should have been processed, ensure
View Full Code Here

        assertEquals(2, content.split("<\\/body>").length);
    }

    @Test
    public void testOutlookForwarded() throws Exception {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
      
        // Check the HTML version
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                 SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
               "/test-documents/testMSG_forwarded.msg");
        try {
           parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
           stream.close();
        }
        
        // Make sure we don't have nested docs
View Full Code Here

        assertEquals(2, content.split("<\\/body>").length);
    }
   
    @Test
    public void testOutlookHTMLfromRTF() throws Exception {
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
      
        // Check the HTML version
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                 SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        InputStream stream = OutlookParserTest.class.getResourceAsStream(
                "/test-documents/test-outlook2003.msg");
        try {
           parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
           stream.close();
        }
        
        // As the HTML version should have been processed, ensure
View Full Code Here

    public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
    public static final MediaType TYPE_DOC = MediaType.application("msword");

    @Test
    public void testPdfParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        Metadata metadata = new Metadata();

        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF.pdf");
View Full Code Here

                !content.contains("libraries.Apache"));
    }

    @Test
    public void testCustomMetadata() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        Metadata metadata = new Metadata();

        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF-custommetadata.pdf");
View Full Code Here

     *  they're encrypted (potentially both text and metadata),
     *  but we can decrypt them easily.
     */
    @Test
    public void testProtectedPDF() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();
       ParseContext context = new ParseContext();

       InputStream stream = PDFParserTest.class.getResourceAsStream(
               "/test-documents/testPDF_protected.pdf");
       try {
           parser.parse(stream, handler, metadata, context);
       } finally {
           stream.close();
       }

       assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
       assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
       assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
       assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
       assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
       assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));

       String content = handler.toString();
       assertTrue(content.contains("RETHINKING THE FINANCIAL NETWORK"));
       assertTrue(content.contains("On 16 November 2002"));
       assertTrue(content.contains("In many important respects"));
      
      
       // Try again with an explicit empty password
       handler = new BodyContentHandler();
       metadata = new Metadata();
      
       context = new ParseContext();
       context.set(PasswordProvider.class, new PasswordProvider() {
           public String getPassword(Metadata metadata) {
              return "";
          }
       });
      
       stream = PDFParserTest.class.getResourceAsStream(
                  "/test-documents/testPDF_protected.pdf");
       try {
          parser.parse(stream, handler, metadata, context);
       } finally {
          stream.close();
       }

       assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

       assertTrue(content.contains("In many important respects"));
    }

    @Test
    public void testTwoTextBoxes() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDFTwoTextBoxes.pdf");
        String content = getText(stream, parser);
        content = content.replaceAll("\\s+"," ");
        assertTrue(content.contains("Left column line 1 Left column line 2 Right column line 1 Right column line 2"));
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.AutoDetectParser

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.