Package org.apache.tika.parser

Examples of org.apache.tika.parser.Parser


    public void testParseGlobalMetadata() throws Exception {
        if(System.getProperty("java.version").startsWith("1.5")) {
            return;
        }

        Parser parser = new NetCDFParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        InputStream stream = NetCDFParser.class
                .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }

        assertEquals(metadata.get(TikaCoreProperties.TITLE),
View Full Code Here


    // Media type built from the "application" top-level type and the OOXML
    // word-processing subtype (presumably the type reported for .docx files — confirm).
    public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
    // Media type built from the "application" top-level type and the "msword"
    // subtype (presumably the type reported for legacy .doc files — confirm).
    public static final MediaType TYPE_DOC = MediaType.application("msword");

    @Test
    public void testPdfParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        Metadata metadata = new Metadata();

        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF.pdf");
View Full Code Here

                !content.contains("libraries.Apache"));
    }

    @Test
    public void testCustomMetadata() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        Metadata metadata = new Metadata();

        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF-custommetadata.pdf");
View Full Code Here

     *  they're encrypted (potentially both text and metadata),
     *  but we can decrypt them easily.
     */
    @Test
    public void testProtectedPDF() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();
       ParseContext context = new ParseContext();

       InputStream stream = PDFParserTest.class.getResourceAsStream(
               "/test-documents/testPDF_protected.pdf");
       try {
           parser.parse(stream, handler, metadata, context);
       } finally {
           stream.close();
       }

       assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
       assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
       assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
       assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
       assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
       assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));

       String content = handler.toString();
       assertTrue(content.contains("RETHINKING THE FINANCIAL NETWORK"));
       assertTrue(content.contains("On 16 November 2002"));
       assertTrue(content.contains("In many important respects"));
      
      
       // Try again with an explicit empty password
       handler = new BodyContentHandler();
       metadata = new Metadata();
      
       context = new ParseContext();
       context.set(PasswordProvider.class, new PasswordProvider() {
           public String getPassword(Metadata metadata) {
              return "";
          }
       });
      
       stream = PDFParserTest.class.getResourceAsStream(
                  "/test-documents/testPDF_protected.pdf");
       try {
          parser.parse(stream, handler, metadata, context);
       } finally {
          stream.close();
       }

       assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

       assertTrue(content.contains("In many important respects"));
    }

    @Test
    public void testTwoTextBoxes() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDFTwoTextBoxes.pdf");
        String content = getText(stream, parser);
        content = content.replaceAll("\\s+"," ");
        assertTrue(content.contains("Left column line 1 Left column line 2 Right column line 1 Right column line 2"));
View Full Code Here

        assertTrue(content.contains("Left column line 1 Left column line 2 Right column line 1 Right column line 2"));
    }

    @Test
    public void testVarious() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        Metadata metadata = new Metadata();
        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDFVarious.pdf");

        String content = getText(stream, parser, metadata);
View Full Code Here

        //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
    }

    @Test
    public void testAnnotations() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
        String content = getText(stream, parser);
        content = content.replaceAll("[\\s\u00a0]+"," ");
        assertContains("Here is some text", content);
        assertContains("Here is a comment", content);
View Full Code Here

    }

    // TIKA-981
    @Test
    public void testPopupAnnotation() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
        String content = getText(stream, parser);
        assertContains("this is the note", content);
        assertContains("igalsh", content);
    }
View Full Code Here

        // Text has extra spaces when autoSpace is on
        assertEquals(-1, content.indexOf("Here is some formatted text"));
       
        //now try with autodetect
        Parser autoParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);
        //default is true
        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
View Full Code Here

        content = getText(stream, parser);
        // "Text the first" was dedup'd:
        assertContains("Text the first timesecond time", content);
       
        //now try with autodetect
        Parser autoParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);
        stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
        // Default is false (keep overlapping text):
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.Parser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.