Package org.apache.tika.example

Source Code of org.apache.tika.example.DumpTikaConfigExample

package org.apache.tika.example;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.translate.DefaultTranslator;
import org.apache.tika.language.translate.Translator;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;


/**
* This class shows how to dump a TikaConfig object to a configuration file.
* This allows users to easily dump the default TikaConfig as a base from which
* to start if they want to modify the default configuration file.
* <p>
* For those who want to modify the mimes file, take a look at
* tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
* for inspiration.  Consider adding org/apache/tika/mime/custom-mimetypes.xml
* for your custom mime types.
*/
public class DumpTikaConfigExample {

    /**
     *
     * @param config config file to dump
     * @param writer writer to which to write
     * @throws Exception
     */
    public void dump(TikaConfig config, Writer writer, String encoding) throws Exception {
        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
        // root elements
        Document doc = docBuilder.newDocument();
        Element rootElement = doc.createElement("properties");

        doc.appendChild(rootElement);
        addMimeComment(rootElement, doc);
        addTranslator(rootElement, doc, config);
        addDetectors(rootElement, doc, config);
        addParsers(rootElement, doc, config);


        //now write
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        Transformer transformer = transformerFactory.newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");
        transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
        DOMSource source = new DOMSource(doc);
        StreamResult result = new StreamResult(writer);

        transformer.transform(source, result);
    }

    private void addTranslator(Element rootElement, Document doc, TikaConfig config) {
        //TikaConfig only reads the first translator from the list,
        //but it looks like it expects a list
        Translator translator = config.getTranslator();
        if (translator instanceof DefaultTranslator) {
            Node mimeComment = doc.createComment(
                    "for example: "+
                            "<translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
            rootElement.appendChild(mimeComment);
        } else {
            Element translatorElement = doc.createElement("translator");
            translatorElement.setAttribute("class", translator.getClass().getCanonicalName());
            rootElement.appendChild(translatorElement);
        }
    }

    private void addMimeComment(Element rootElement, Document doc) {
        Node mimeComment = doc.createComment(
                "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
        rootElement.appendChild(mimeComment);
    }

    private void addDetectors(Element rootElement, Document doc, TikaConfig config) throws Exception {
        Detector detector = config.getDetector();
        Element detectorsElement = doc.createElement("detectors");

        if (detector instanceof DefaultDetector) {
            List<Detector> children = ((DefaultDetector)detector).getDetectors();
            for (Detector d : children) {
                Element detectorElement = doc.createElement("detector");
                detectorElement.setAttribute("class", d.getClass().getCanonicalName());
                detectorsElement.appendChild(detectorElement);
            }
        }
        rootElement.appendChild(detectorsElement);
    }

    private void addParsers(Element rootElement, Document doc, TikaConfig config) throws Exception {
        Map<String, Parser> parsers = getConcreteParsers(config.getParser());

        Element parsersElement = doc.createElement("parsers");
        rootElement.appendChild(parsersElement);

        ParseContext context = new ParseContext();
        for (Map.Entry<String, Parser> e : parsers.entrySet()) {
            Element parserElement = doc.createElement("parser");
            Parser child = e.getValue();
            String className = e.getKey();
            parserElement.setAttribute("class", className);
            Set<MediaType> types = new TreeSet<MediaType>();
            types.addAll(child.getSupportedTypes(context));
            for (MediaType type : types){
                Element mimeElement = doc.createElement("mime");
                mimeElement.appendChild(doc.createTextNode(type.toString()));
                parserElement.appendChild(mimeElement);
            }
            parsersElement.appendChild(parserElement);
        }
        rootElement.appendChild(parsersElement);

    }

    private Map<String, Parser> getConcreteParsers(Parser parentParser)throws TikaException, IOException  {
        Map<String, Parser> parsers = new TreeMap<String, Parser>();
        if (parentParser instanceof CompositeParser) {
            addParsers((CompositeParser)parentParser, parsers);
        } else {
            addParser(parentParser, parsers);
        }
        return parsers;
    }

    private void addParsers(CompositeParser p, Map<String, Parser> parsers) {
        for (Parser child : p.getParsers().values()) {
            if (child instanceof CompositeParser) {
                addParsers((CompositeParser)child, parsers);
            } else {
                addParser(child, parsers);
            }
        }
    }

    private void addParser(Parser p, Map<String, Parser> parsers) {
        parsers.put(p.getClass().getCanonicalName(), p);
    }

    /**
     *
     * @param args outputFile, outputEncoding, if args is empty, this prints to console
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {

        String encoding = "UTF-8";
        Writer writer = null;
        if (args.length > 0) {
            writer = new OutputStreamWriter(new FileOutputStream(new File(args[0])));
        } else {
            writer = new StringWriter();
        }

        if (args.length > 1) {
            encoding = args[1];
        }
        DumpTikaConfigExample ex = new DumpTikaConfigExample();
        ex.dump(TikaConfig.getDefaultConfig(), writer, encoding);

        writer.flush();

        if (writer instanceof StringWriter) {
            System.out.println(writer.toString());
        }
        writer.close();
    }
}
TOP

Related Classes of org.apache.tika.example.DumpTikaConfigExample

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.