Package com.digitalpebble.behemoth.tika

Source Code of com.digitalpebble.behemoth.tika.TikaDriver

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.behemoth.tika;

import com.digitalpebble.behemoth.BehemothConfiguration;
import com.digitalpebble.behemoth.BehemothDocument;
import com.digitalpebble.behemoth.BehemothReducer;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;

public class TikaDriver extends Configured implements Tool, TikaConstants {
    private transient static Logger log = LoggerFactory
            .getLogger(TikaDriver.class);

    public TikaDriver() {
        super(null);
    }

    public TikaDriver(Configuration conf) {
        super(conf);
    }

    public static void main(String args[]) throws Exception {
        int res = ToolRunner.run(BehemothConfiguration.create(),
                new TikaDriver(), args);
        System.exit(res);
    }

    public int run(String[] args) throws Exception {

        final FileSystem fs = FileSystem.get(getConf());
        GroupBuilder gBuilder = new GroupBuilder().withName("Options:");
        List<Option> options = new ArrayList<Option>();
        Option inputOpt = buildOption("input", "i", "The input path", true,
                true, null);
        options.add(inputOpt);
        Option outOpt = buildOption("output", "o", "The output path", true,
                true, null);
        options.add(outOpt);
        Option tikaOpt = buildOption(
                "tikaProcessor",
                "t",
                "The fully qualified name of a TikaProcessor class that handles the extraction",
                true, false, null);
        options.add(tikaOpt);
        Option mimeTypeOpt = buildOption("mimeType", "m",
                "The mime type to use", true, false, "");
        options.add(mimeTypeOpt);
        for (Option opt : options) {
            gBuilder = gBuilder.withOption(opt);
        }

        Group group = gBuilder.create();

        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            // TODO catch exceptions with parsing of opts
            CommandLine cmdLine = parser.parse(args);
            Path inputPath = new Path(cmdLine.getValue(inputOpt).toString());
            Path outputPath = new Path(cmdLine.getValue(outOpt).toString());
            String handlerName = null;
            if (cmdLine.hasOption(tikaOpt)) {
                handlerName = cmdLine.getValue(tikaOpt).toString();
            }

            JobConf job = new JobConf(getConf());
            job.setJarByClass(this.getClass());

            if (cmdLine.hasOption(mimeTypeOpt)) {
                String mimeType = cmdLine.getValue(mimeTypeOpt).toString();
                job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType);
            }

            if (handlerName != null && handlerName.equals("") == false) {
                job.set(TIKA_PROCESSOR_KEY, handlerName);
            }

            job.setJobName("Tika : " + inputPath.toString());

            job.setInputFormat(SequenceFileInputFormat.class);
            job.setOutputFormat(SequenceFileOutputFormat.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(BehemothDocument.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(BehemothDocument.class);

            job.setMapperClass(TikaMapper.class);

            boolean isFilterRequired = BehemothReducer.isRequired(job);
            if (isFilterRequired)
                job.setReducerClass(BehemothReducer.class);
            else {
                job.setNumReduceTasks(0);
            }

            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, outputPath);

            try {
                JobClient.runJob(job);
            } catch (Exception e) {
                log.error("Exception", e);
                return -1;
                // don't delete the output as some of it could be used
                // fs.delete(outputPath, true);
            } finally {
            }

        } catch (OptionException e) {
            log.error("OptionException", e);
            return -1;
        }

        return 0;
    }

    // taken from Mahout AbstractJob
    private Option buildOption(String name, String shortName,
            String description, boolean hasArg, boolean required,
            String defaultValue) {

        DefaultOptionBuilder optBuilder = new DefaultOptionBuilder()
                .withLongName(name).withDescription(description)
                .withRequired(required);

        if (shortName != null) {
            optBuilder.withShortName(shortName);
        }

        if (hasArg) {
            ArgumentBuilder argBuilder = new ArgumentBuilder().withName(name)
                    .withMinimum(1).withMaximum(1);

            if (defaultValue != null) {
                argBuilder = argBuilder.withDefault(defaultValue);
            }

            optBuilder.withArgument(argBuilder.create());
        }

        return optBuilder.create();
    }

}
TOP

Related Classes of com.digitalpebble.behemoth.tika.TikaDriver

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.