Package org.apache.tika.parser

Examples of org.apache.tika.parser.AutoDetectParser


        assertTrue(content.contains("Left column line 1 Left column line 2 Right column line 1 Right column line 2"));
    }

    @Test
    public void testVarious() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        Metadata metadata = new Metadata();
        InputStream stream = PDFParserTest.class.getResourceAsStream(
                "/test-documents/testPDFVarious.pdf");

        String content = getText(stream, parser, metadata);
View Full Code Here


        //assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
    }

    @Test
    public void testAnnotations() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = getResourceAsStream("/test-documents/testAnnotations.pdf");
        String content = getText(stream, parser);
        content = content.replaceAll("[\\s\u00a0]+"," ");
        assertContains("Here is some text", content);
        assertContains("Here is a comment", content);
View Full Code Here

    }

    // TIKA-981
    @Test
    public void testPopupAnnotation() throws Exception {
        Parser parser = new AutoDetectParser(); // Should auto-detect!
        InputStream stream = getResourceAsStream("/test-documents/testPopupAnnotation.pdf");
        String content = getText(stream, parser);
        assertContains("this is the note", content);
        assertContains("igalsh", content);
    }
View Full Code Here

        // Text has extra spaces when autoSpace is on
        assertEquals(-1, content.indexOf("Here is some formatted text"));
       
        //now try with autodetect
        Parser autoParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);
        //default is true
        stream = getResourceAsStream("/test-documents/testExtraSpaces.pdf");
View Full Code Here

        content = getText(stream, parser);
        // "Text the first" was dedup'd:
        assertContains("Text the first timesecond time", content);
       
        //now try with autodetect
        Parser autoParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);
        stream = getResourceAsStream("/test-documents/testOverlappingText.pdf");
        // Default is false (keep overlapping text):
View Full Code Here

        content = content.replaceAll("\\s+", " ");
        // Column text is now interleaved:
        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", content);
       
        //now try setting autodetect via parsecontext       
        AutoDetectParser autoParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        context.set(PDFParserConfig.class, config);
        stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
        // Default is false (do not sort):
View Full Code Here

       /* format of test doc:
         docx/
            pdf/
               docx
       */
       Parser parser = new AutoDetectParser(); // Should auto-detect!
       ContentHandler handler = new BodyContentHandler();
       Metadata metadata = new Metadata();
       ParseContext context = new ParseContext();
       String content = "";
       InputStream stream = null;
       try{
          context.set(org.apache.tika.parser.Parser.class, parser);
          stream = getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx");
          parser.parse(stream, handler, metadata, context);
          content = handler.toString();
       } finally {
          stream.close();
       }
       int outerHaystack = content.indexOf("Outer_haystack");
View Full Code Here

     *
     * TODO: more testing
     */
    @Test
    public void testSequentialParser() throws Exception{
        Parser defaultParser = new AutoDetectParser();
        Parser sequentialParser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        PDFParserConfig config = new PDFParserConfig();
        config.setUseNonSequentialParser(true);
        context.set(PDFParserConfig.class, config);

View Full Code Here

    return produceMetadata(is, httpHeaders.getRequestHeaders(), info);
  }
 
  private StreamingOutput produceMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info) throws Exception {
    final Metadata metadata = new Metadata();
    AutoDetectParser parser = TikaResource.createParser();
    TikaResource.fillMetadata(parser, metadata, httpHeaders);
    TikaResource.logRequest(logger, info, metadata);

    parser.parse(is, new DefaultHandler(), metadata);

    return new StreamingOutput() {
      public void write(OutputStream outputStream) throws IOException, WebApplicationException {
        metadataToCsv(metadata, outputStream);
      }
View Full Code Here

        InputStream stream =
            JackrabbitParser.class.getResourceAsStream("tika-config.xml");
        try {
            if (stream != null) {
                try {
                    parser = new AutoDetectParser(new TikaConfig(stream));
                } finally {
                    stream.close();
                }
            } else {
                parser = new AutoDetectParser();
            }
        } catch (Exception e) {
            // Should never happen
            throw new RuntimeException(
                    "Unable to load embedded Tika configuration", e);
View Full Code Here

TOP

Related Classes of org.apache.tika.parser.AutoDetectParser

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.