Examples of org.apache.tika.parser.Parser

org.apache.tika.parser.Parser
Tika parser interface.

    public void testParseGlobalMetadata() throws Exception {
        if(System.getProperty("java.version").startsWith("1.5")) {
            return;
        }


        Parser parser = new NetCDFParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();


        InputStream stream = NetCDFParser.class
                .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
        try {
            parser.parse(stream, handler, metadata, new ParseContext());
        } finally {
            stream.close();
        }


        assertEquals(metadata.get(TikaCoreProperties.TITLE),

View Full Code Here

    public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
    public static final MediaType TYPE_DOC = MediaType.application("msword");


    @Test
    public void testPdfParsing() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        Metadata metadata = new Metadata();


        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF.pdf");

View Full Code Here

                !content.contains("libraries.Apache"));
    }


    @Test
    public void testCustomMetadata() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        Metadata metadata = new Metadata();


        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDF-custommetadata.pdf");

View Full Code Here

     *  they're encrypted (potentially both text and metadata),
     *  but we can decrypt them easily.
     */
    @Test
    public void testProtectedPDF() throws Exception {
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();
       ParseContext context = new ParseContext();


       InputStream stream = PDFParserTest.class.getResourceAsStream(
               "/test-documents/testPDF_protected.pdf");
       try {
           parser.parse(stream, handler, metadata, context);
       } finally {
           stream.close();
       }


       assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
       assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
       assertEquals("The Bank of England", metadata.get(Metadata.AUTHOR));
       assertEquals("Speeches by Andrew G Haldane", metadata.get(OfficeOpenXMLCore.SUBJECT));
       assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
       assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));


       String content = handler.toString();
       assertTrue(content.contains("RETHINKING THE FINANCIAL NETWORK"));
       assertTrue(content.contains("On 16 November 2002"));
       assertTrue(content.contains("In many important respects"));
       
       
       // Try again with an explicit empty password
       handler = new BodyContentHandler();
       metadata = new Metadata();
       
       context = new ParseContext();
       context.set(PasswordProvider.class, new PasswordProvider() {
           public String getPassword(Metadata metadata) {
              return "";
          }
       });
       
       stream = PDFParserTest.class.getResourceAsStream(
                  "/test-documents/testPDF_protected.pdf");
       try {
          parser.parse(stream, handler, metadata, context);
       } finally {
          stream.close();
       }


       assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));

View Full Code Here

       assertTrue(content.contains("In many important respects"));
    }


    @Test
    public void testTwoTextBoxes() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDFTwoTextBoxes.pdf");
        String content = getText(stream, parser);
        content = content.replaceAll("\\s+"," ");
        assertTrue(content.contains("Left column line 1 Left column line 2 Right column line 1 Right column line 2"));

View Full Code Here

        assertTrue(content.contains("Left column line 1 Left column line 2 Right column line 1 Right column line 2"));
    }


    @Test
    public void testVarious() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        Metadata metadata = new Metadata();
        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDFVarious.pdf");


        String content = getText(stream, parser, metadata);

View Full Code Here

        //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
    }


    @Test
    public void testAnnotations() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
        String content = getText(stream, parser);
        content = content.replaceAll("[\\s\u00a0]+"," ");
        assertContains("Here is some text", content);
        assertContains("Here is a comment", content);

View Full Code Here

    }


    // TIKA-981
    @Test
    public void testPopupAnnotation() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
        String content = getText(stream, parser);
        assertContains("this is the note", content);
        assertContains("igalsh", content);
    }

View Full Code Here


        // Text has extra spaces when autoSpace is on
        assertEquals(-1, content.indexOf("Here is some formatted text"));
        
        //now try with autodetect
        Parser autoParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);
        //default is true
        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");

View Full Code Here

        content = getText(stream, parser);
        // "Text the first" was dedup'd:
        assertContains("Text the first timesecond time", content);
        
        //now try with autodetect
        Parser autoParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);
        stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
        // Default is false (keep overlapping text):

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.tika.parser.Parser

bixo.parser.TikaCallableTest

com.baasbox.controllers.File

com.cloudera.cdk.morphline.solrcell.SolrCellBuilder$SolrCell

com.digitalpebble.behemoth.tika.TikaProcessor

com.findwise.hydra.stage.tika.SimpleFetchingTikaStageTest

com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester

com.ikanow.infinit.e.harvest.extraction.text.legacy.TextExtractorTika

com.nidhinova.tika.server.TikaService

com.tamingtext.tika.TikaTest

ipcress.Extraction

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Corporation. Contact coftware#gmail.com.