Package org.apache.tika.fork

Examples of org.apache.tika.fork.ForkParser


        if (config == null) {
            config = TikaConfig.getDefaultConfig();
        }

        if (forkJavaCommand != null) {
            ForkParser forkParser = new ForkParser(
                    SearchIndex.class.getClassLoader(),
                    new AutoDetectParser(config));
            forkParser.setJavaCommand(forkJavaCommand);
            forkParser.setPoolSize(extractorPoolSize);
            return forkParser;
        } else {
            return new AutoDetectParser(config);
        }
    }
View Full Code Here


        public void process(
                InputStream input, OutputStream output, Metadata metadata)
                throws Exception {
            Parser p = parser;
            if (fork) {
                p = new ForkParser(TikaCLI.class.getClassLoader(), p);
            }
            ContentHandler handler = getContentHandler(output, metadata);
            p.parse(input, handler, metadata, context);
            // fix for TIKA-596: if a parser doesn't generate
            // XHTML output, the lack of an output document prevents
View Full Code Here

    /**
     * Simple text parsing
     */
    @Test
    public void testForkedTextParsing() throws Exception {
        ForkParser parser = new ForkParser(
                ForkParserIntegrationTest.class.getClassLoader(),
                tika.getParser());

       try {
          ContentHandler output = new BodyContentHandler();
          InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
                  "/test-documents/testTXT.txt");
          ParseContext context = new ParseContext();
          parser.parse(stream, output, new Metadata(), context);

          String content = output.toString();
          assertTrue(content.contains("Test d'indexation"));
          assertTrue(content.contains("http://www.apache.org"));
       } finally {
          parser.close();
       }
    }
View Full Code Here

     *  properly reported
     */
    @Test
    public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
        BrokenParser brokenParser = new BrokenParser();
        Parser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
        InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt");
       
        // With a serializable error, we'll get that back
        try {
            ContentHandler output = new BodyContentHandler();
            ParseContext context = new ParseContext();
            parser.parse(stream, output, new Metadata(), context);
            fail("Expected TikaException caused by Error");
        } catch (TikaException e) {
            assertEquals(brokenParser.err, e.getCause());
        }
       
        // With a non serializable one, we'll get something else
        // TODO Fix this test
        brokenParser = new BrokenParser();
        brokenParser.re= new WontBeSerializedError("Can't Serialize");
        parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
//        try {
//           ContentHandler output = new BodyContentHandler();
//           ParseContext context = new ParseContext();
//           parser.parse(stream, output, new Metadata(), context);
//           fail("Expected TikaException caused by Error");
View Full Code Here

     * If we supply a non serializable object on the ParseContext,
     *  check we get a helpful exception back
     */
    @Test
    public void testParserHandlingOfNonSerializable() throws Exception {
       ForkParser parser = new ForkParser(
             ForkParserIntegrationTest.class.getClassLoader(),
             tika.getParser());
      
       ParseContext context = new ParseContext();
       context.set(Detector.class, new Detector() {
          public MediaType detect(InputStream input, Metadata metadata) {
             return MediaType.OCTET_STREAM;
          }
       });

       try {
          ContentHandler output = new BodyContentHandler();
          InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
              "/test-documents/testTXT.txt");
          parser.parse(stream, output, new Metadata(), context);
          fail("Should have blown up with a non serializable ParseContext");
       } catch(TikaException e) {
          // Check the right details
          assertNotNull(e.getCause());
          assertEquals(NotSerializableException.class, e.getCause().getClass());
          assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
       } finally {
          parser.close();
       }
    }
View Full Code Here

    public void testAttachingADebuggerOnTheForkedParserShouldWork()
            throws Exception {
        ParseContext context = new ParseContext();
        context.set(Parser.class, tika.getParser());

        ForkParser parser = new ForkParser(
                ForkParserIntegrationTest.class.getClassLoader(),
                tika.getParser());
        parser.setJavaCommand(
                "java -Xmx32m -Xdebug -Xrunjdwp:"
                + "transport=dt_socket,address=54321,server=y,suspend=n");
        try {
            ContentHandler body = new BodyContentHandler();
            InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
                    "/test-documents/testTXT.txt");
            parser.parse(stream, body, new Metadata(), context);
            String content = body.toString();
            assertTrue(content.contains("Test d'indexation"));
            assertTrue(content.contains("http://www.apache.org"));
        } finally {
            parser.close();
        }
    }
View Full Code Here

     * TIKA-808 - Ensure that parsing of our test PDFs work under
     * the Fork Parser, to ensure that complex parsing behaves
     */
    @Test
    public void testForkedPDFParsing() throws Exception {
        ForkParser parser = new ForkParser(
                ForkParserIntegrationTest.class.getClassLoader(),
                tika.getParser());
        try {
            ContentHandler output = new BodyContentHandler();
            InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
                    "/test-documents/testPDF.pdf");
            ParseContext context = new ParseContext();
            parser.parse(stream, output, new Metadata(), context);

            String content = output.toString();
            assertTrue(content.contains("Apache Tika"));
            assertTrue(content.contains("Tika - Content Analysis Toolkit"));
            assertTrue(content.contains("incubator"));
            assertTrue(content.contains("Apache Software Foundation"));
        } finally {
            parser.close();
        }
    }
View Full Code Here

        if (config == null) {
            config = TikaConfig.getDefaultConfig();
        }

        if (forkJavaCommand != null) {
            ForkParser forkParser = new ForkParser(
                    SearchIndex.class.getClassLoader(),
                    new AutoDetectParser(config));
            forkParser.setJavaCommand(forkJavaCommand);
            forkParser.setPoolSize(extractorPoolSize);
            return forkParser;
        } else {
            return new AutoDetectParser(config);
        }
    }
View Full Code Here

        if (config == null) {
            config = TikaConfig.getDefaultConfig();
        }

        if (forkJavaCommand != null) {
            ForkParser forkParser = new ForkParser(
                    SearchIndex.class.getClassLoader(),
                    new AutoDetectParser(config));
            forkParser.setJavaCommand(forkJavaCommand);
            forkParser.setPoolSize(extractorPoolSize);
            return forkParser;
        } else {
            return new AutoDetectParser(config);
        }
    }
View Full Code Here

        public void process(
                InputStream input, OutputStream output, Metadata metadata)
                throws Exception {
            Parser p = parser;
            if (fork) {
                p = new ForkParser(TikaCLI.class.getClassLoader(), p);
            }
            ContentHandler handler = getContentHandler(output, metadata);
            p.parse(input, handler, metadata, context);
            // fix for TIKA-596: if a parser doesn't generate
            // XHTML output, the lack of an output document prevents
View Full Code Here

TOP

Related Classes of org.apache.tika.fork.ForkParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.