Package org.carrot2.cli.batch

Source Code of org.carrot2.cli.batch.BatchApp

/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/

package org.carrot2.cli.batch;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.ProcessingComponentDescriptor;
import org.carrot2.core.ProcessingComponentDescriptor.ProcessingComponentDescriptorToId;
import org.carrot2.core.ProcessingComponentSuite;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.source.xml.XmlDocumentSource;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.util.CloseableUtils;
import org.carrot2.util.ReflectionUtils;
import org.carrot2.util.attribute.AttributeUtils;
import org.carrot2.util.resource.DirLocator;
import org.carrot2.util.resource.FileResource;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.ResourceLookup;
import org.kohsuke.args4j.Argument;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.slf4j.Logger;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
* Carrot2 batch processing command line application.
*/
public class BatchApp
{
    private final Logger log = org.slf4j.LoggerFactory.getLogger("batch");

    @Option(name = "-v", aliases =
    {
        "--verbose"
    }, required = false, usage = "Print detailed messages")
    boolean verbose;

    @Option(name = "-o", aliases =
    {
        "--output-dir"
    }, required = false, metaVar = "DIR", usage = "Directory for output files")
    File outputDir = new File("output");

    enum Format
    {
        JSON, XML;
    }

    @Option(name = "-f", aliases =
    {
        "--output-format"
    }, required = false, usage = "Output format")
    Format outputFormat = Format.XML;

    @Option(name = "-d", aliases =
    {
        "--output-documents"
    }, required = false, usage = "Copies input documents on output")
    boolean outputDocuments = false;
   
    @Option(name = "-t", aliases =
    {
        "--output-attributes"
    }, required = false, usage = "Copies attribute values on ouput")
    boolean outputAttributes = false;

    @Option(name = "-a", aliases =
    {
        "--algorithm"
    }, required = false, metaVar = "ALGORITHM", usage = "Identifier or class name of the clustering algorithm to use, see below for the list")
    String algorithm;

    @Argument(metaVar = "INPUT", required = true, usage = "File in Carrot2 XML format or directory of files to cluster")
    List<File> inputFiles;

    int filesClusteredTotal = 0;
    int filesClusteredWithWarnings = 0;

    private ProcessingComponentSuite componentSuite;
    private List<ProcessingComponentDescriptor> algorithms;

    /**
     * Private constructor. Reads the available algorithms from the component suite.
     */
    private BatchApp() throws Exception
    {
        final File suitesDir = new File("suites");
        ResourceLookup suiteLookup = new ResourceLookup(
            new DirLocator(suitesDir));

        IResource suite = suiteLookup.getFirst("suite-batch.xml");
        if (suite == null)
            throw new RuntimeException(
                "Could not find suite-batch.xml in "
                    + suitesDir.getAbsolutePath());

        componentSuite = ProcessingComponentSuite.deserialize(suite, suiteLookup);

        algorithms = componentSuite.getAlgorithms();
        if (algorithms.isEmpty())
        {
            throw new RuntimeException(
                "Component suite does not contain any clustering algorithms.");
        }
    }

    /**
     * Processes all input.
     */
    private int process() throws Exception
    {
        final Controller controller = ControllerFactory.createPooling();
        final Map<String, Object> initAttributes = ImmutableMap.<String, Object> of(
            AttributeUtils.getKey(DefaultLexicalDataFactory.class, "resourceLookup"),
            new ResourceLookup(new DirLocator("resources")));

        controller.init(
            initAttributes,
            componentSuite.getComponentConfigurations());       

        // Prepare the algorithm
        if (StringUtils.isBlank(algorithm))
        {
            // Set the first algorithm as the default.
            algorithm = algorithms.get(0).getId();
        }
        else
        {
            // Check if the provided algorithm is valid
            try
            {
                ReflectionUtils.classForName(algorithm, false);
            }
            catch (ClassNotFoundException ignored)
            {
                // See if there's a corresponding algorithm in the suite
                final List<String> algorithmIds = Lists.transform(algorithms,
                    ProcessingComponentDescriptorToId.INSTANCE);
                if (!algorithmIds.contains(algorithm))
                {
                    log.warn("No such algorithm: " + algorithm
                        + ". Available algorithms: " + algorithmIds.toString());
                    return 20;
                }
            }
        }

        if (verbose)
        {
            log.info("Clustering with " + algorithm);
        }

        // Check if the output directory exists, create on if necessary
        if (!checkAndMakeDir(outputDir))
        {
            return 20;
        }

        // Process files in the order they were specified. For input directories,
        // a corresponding directory will be created on output.
        final long start = System.currentTimeMillis();
        for (File file : inputFiles)
        {
            try
            {
                process(file, outputDir, controller);
            }
            catch (Exception e)
            {
                processingWarning(file, e);
            }
        }

        log.info("Clustering of "
            + filesClusteredTotal
            + " files completed"
            + (filesClusteredWithWarnings > 0 ? " with " + filesClusteredWithWarnings
                + " warnings" : "") + " [" + (System.currentTimeMillis() - start)
            + " ms]");

        return filesClusteredWithWarnings == 0 ? 0 : 10;
    }

    /**
     * Files a processing warning.
     */
    private void processingWarning(File processedFile, Exception e)
    {
        filesClusteredWithWarnings++;
        final String message = "Failed to process "
            + (processedFile.isDirectory() ? "directory" : "file") + ": "
            + processedFile.getAbsolutePath();
        if (verbose)
        {
            log.warn(message, e);
        }
        else
        {
            log.warn(message);
        }
    }

    /**
     * Checks if a directory exists and attempts to create one.
     *
     * @return <code>false</code> if the directory cannot be created
     */
    private boolean checkAndMakeDir(final File dir)
    {
        if (dir.exists())
        {
            if (!dir.isDirectory())
            {
                log.warn("Output directory: " + dir.getAbsolutePath()
                    + " exists, but is not a directory");
                return false;
            }
        }
        else
        {
            if (!dir.mkdirs())
            {
                log.warn("Failed to create output directory: " + dir.getAbsolutePath());
                return false;
            }
        }

        return true;
    }

    /**
     * Processes an individual file or a directory.
     */
    private void process(File fileOrDirectory, File currentOutputDir,
        Controller controller) throws Exception
    {
        if (!fileOrDirectory.exists())
        {
            log.warn("File " + fileOrDirectory.getAbsolutePath() + " does not exist");
            return;
        }

        final String fileName = fileOrDirectory.getName();
        if (fileOrDirectory.isDirectory())
        {
            final File newCurrentOutputDir = new File(currentOutputDir, fileName);
            if (checkAndMakeDir(newCurrentOutputDir))
            {
                for (File fileOrDir : fileOrDirectory.listFiles())
                {
                    try
                    {
                        process(fileOrDir, newCurrentOutputDir, controller);
                    }
                    catch (Exception e)
                    {
                        processingWarning(fileOrDir, e);
                    }
                }
            }
        }
        else
        {
            filesClusteredTotal++;

            final Map<String, Object> attributes = Maps.newHashMap();
            attributes.put("XmlDocumentSource.xml", new FileResource(fileOrDirectory));
            final ProcessingResult result = controller.process(attributes,
                XmlDocumentSource.class.getName(), algorithm);

            // Stick to UTF-8 encoding on the output.
            final String outputFileName =
                Format.JSON.equals(outputFormat) && fileName.endsWith(".xml")
                ? fileName.substring(0, fileName .length() - 4) + ".json" : fileName;

            final OutputStream stream = new FileOutputStream(
                new File(currentOutputDir, outputFileName));
            try
            {
                if (Format.JSON.equals(outputFormat))
                {
                    Writer w = new OutputStreamWriter(stream, "UTF-8");
                    result.serializeJson(w, null, outputDocuments, true, outputAttributes);
                    w.flush();
                }
                else
                {
                    result.serialize(stream, outputDocuments, true, outputAttributes);
                }
            }
            finally
            {
                CloseableUtils.close(stream);
            }

            log.info("Clustering " + fileOrDirectory.getAbsolutePath() + " ["
                + result.getAttribute(AttributeNames.PROCESSING_TIME_TOTAL) + "ms]");
        }
    }

    public static void main(String [] args) throws Exception
    {
        final BatchApp batch = new BatchApp();

        final CmdLineParser parser = new CmdLineParser(batch);
        parser.setUsageWidth(80);

        try
        {
            parser.parseArgument(args);
            System.exit(batch.process());
        }
        catch (CmdLineException e)
        {
            System.out.print("Usage: batch");
            parser.printSingleLineUsage(System.out);
            System.out.println();
            parser.printUsage(System.out);

            System.out.println("\n" + e.getMessage());

            final List<String> algorithmIds = Lists.transform(batch.algorithms,
                ProcessingComponentDescriptorToId.INSTANCE);
            System.out.println("\nAvailable algorithms: " + algorithmIds.toString());
        }
    }
}
TOP

Related Classes of org.carrot2.cli.batch.BatchApp

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.