Examples of AutoDetectParser


Examples of org.apache.tika.parser.AutoDetectParser

        return Normalizer.normalize(text, Normalizer.Form.NFC);
    }

    private String extractTextWithTika(byte[] textBytes, Metadata metadata) throws TikaException, SAXException, IOException {
        AutoDetectParser parser = new AutoDetectParser(new MimeTypes());
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        OutputStreamWriter writer = new OutputStreamWriter(baos, "UTF-8");
        ContentHandler handler = new BodyContentHandler(writer);
        ParseContext context = new ParseContext();
        context.set(PDFParserConfig.class, new LumifyParserConfig());
        parser.parse(new ByteArrayInputStream(textBytes), handler, metadata, context);
        return IOUtils.toString(baos.toByteArray(), "UTF-8");
    }
View Full Code Here

Examples of org.apache.tika.parser.AutoDetectParser

        _linkExtractor.setLinkAttributeTypes(getParserPolicy().getLinkAttributeTypes());
        _linkExtractor.reset();
    }

    public Parser getTikaParser() {
        return new AutoDetectParser();
    }
View Full Code Here

Examples of org.apache.tika.parser.AutoDetectParser

    //<start id="tika"/>
    InputStream input = new FileInputStream(
            new File("src/test/resources/pdfBox-sample.pdf"));//<co id="tika.is"/>
    ContentHandler textHandler = new BodyContentHandler();//<co id="tika.handler"/>
    Metadata metadata = new Metadata();//<co id="tika.metadata"/>
    Parser parser = new AutoDetectParser();//<co id="tika.parser"/>
    ParseContext context = new ParseContext();
    parser.parse(input, textHandler, metadata, context);//<co id="tika.parse"/>
    System.out.println("Title: " + metadata.get(Metadata.TITLE));//<co id="tika.title"/>
    System.out.println("Body: " + textHandler.toString());//<co id="tika.body"/>
    /*
<calloutlist>
    <callout arearefs="tika.is"><para>Create the <classname>InputStream</classname> to read in the content</para></callout>
View Full Code Here

Examples of org.apache.tika.parser.AutoDetectParser

        }

        if (forkJavaCommand != null) {
            ForkParser forkParser = new ForkParser(
                    SearchIndex.class.getClassLoader(),
                    new AutoDetectParser(config));
            forkParser.setJavaCommand(forkJavaCommand);
            forkParser.setPoolSize(extractorPoolSize);
            return forkParser;
        } else {
            return new AutoDetectParser(config);
        }
    }
View Full Code Here

Examples of org.apache.tika.parser.AutoDetectParser

    private boolean pipeMode = true;

    public TikaCLI() throws TransformerConfigurationException {
        context = new ParseContext();
        parser = new AutoDetectParser();
        context.set(Parser.class, parser);
    }
View Full Code Here

Examples of org.apache.tika.parser.AutoDetectParser

    public void testExcel() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/testEXCEL.xlsx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
View Full Code Here

Examples of org.apache.tika.parser.AutoDetectParser

    public void testExcelFormats() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/testEXCEL-formats.xlsx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL-formats.xlsx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    metadata.get(Metadata.CONTENT_TYPE));
View Full Code Here

Examples of org.apache.tika.parser.AutoDetectParser

    public void testPowerPoint() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/testPPT.pptx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testPPT.pptx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
View Full Code Here

Examples of org.apache.tika.parser.AutoDetectParser

    public void testWord() throws Exception {
        InputStream input = OOXMLParserTest.class
                .getResourceAsStream("/test-documents/testWORD.docx");

        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // TODO: should auto-detect without the resource name
        metadata.set(Metadata.RESOURCE_NAME_KEY, "testWORD.docx");
        ContentHandler handler = new BodyContentHandler();

        try {
            parser.parse(input, handler, metadata);

            assertEquals(
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
View Full Code Here

Examples of org.apache.tika.parser.AutoDetectParser

    private boolean pipeMode = true;

    public TikaCLI() throws TransformerConfigurationException {
        context = new ParseContext();
        parser = new AutoDetectParser();
        context.set(Parser.class, parser);
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.