Package com.cybozu.labs.langdetect

Source Code of com.cybozu.labs.langdetect.Command

/*
* Copyright 2011 Nakatani Shuyo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.cybozu.labs.langdetect;

import java.io.File;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;

import be.frma.langguess.IOUtils;
import be.frma.langguess.LangProfileFactory;

import com.cybozu.labs.langdetect.util.LangProfile;

/**
*
* LangDetect Command Line Interface
* <p>
* This is a command line interface of Language Detection Library "LandDetect".
*
*
* @author Nakatani Shuyo
* @author Francois ROLAND
*
*/
public class Command {
    /** smoothing default parameter (ELE) */
    private static final double DEFAULT_ALPHA = 0.5;

    /** for Command line easy parser */
    private HashMap<String, String> opt_with_value = new HashMap<String, String>();
    private HashMap<String, String> values = new HashMap<String, String>();
    private HashSet<String> opt_without_value = new HashSet<String>();
    private ArrayList<String> arglist = new ArrayList<String>();

    /**
     * Command line easy parser
     * @param args command line arguments
     */
    private void parse(String[] args) {
        for(int i=0;i<args.length;++i) {
            if (opt_with_value.containsKey(args[i])) {
                String key = opt_with_value.get(args[i]);
                values.put(key, args[i+1]);
                ++i;
            } else if (args[i].startsWith("-")) {
                opt_without_value.add(args[i]);
            } else {
                arglist.add(args[i]);
            }
        }
    }

    private void addOpt(String opt, String key, String value) {
        opt_with_value.put(opt, key);
        values.put(key, value);
    }
    private String get(String key) {
        return values.get(key);
    }
    private Long getLong(String key) {
        String value = values.get(key);
        if (value == null) return null;
        try {
            return Long.valueOf(value);
        } catch (NumberFormatException e) {
            return null;
        }
    }
    private double getDouble(String key, double defaultValue) {
        try {
            return Double.valueOf(values.get(key));
        } catch (NumberFormatException e) {
            return defaultValue;
        }
    }

    private boolean hasOpt(String opt) {
        return opt_without_value.contains(opt);
    }

       
    /**
     * File search (easy glob)
     * @param directory directory path
     * @param pattern   searching file pattern with regular representation
     * @return matched file
     */
    private File searchFile(File directory, String pattern) {
        for(File file : directory.listFiles()) {
            if (file.getName().matches(pattern)) return file;
        }
        return null;
    }


    /**
     * load profiles
     * @return false if load success
     */
    private boolean loadProfile() {
        String profileDirectory = get("directory") + "/";
        try {
            DetectorFactory.loadProfile(profileDirectory);
            Long seed = getLong("seed");
            if (seed != null) DetectorFactory.setSeed(seed);
            return false;
        } catch (LangDetectException e) {
            System.err.println("ERROR: " + e.getMessage());
            return true;
        }
    }
   
    /**
     * Generate Language Profile from a text file.
     *
     * <pre>
     * usage: --genprofile [text file] [language name]
     * </pre>
     *
     */
    public void generateProfile() {
        File directory = new File(arglist.get(0));
        String lang = arglist.get(1);
        File file = searchFile(directory, lang + "wiki-.*-abstract\\.xml.*");
        if (file == null) {
            System.err.println("Not Found text file : lang = " + lang);
            return;
        }

        ObjectOutputStream os = null;
        try {
            LangProfile profile = GenProfile.load(lang, file);
            profile.omitLessFreq();
            LangProfileFactory.writeProfile(profile, new FileOutputStream(new File(lang)));
        } catch (IOException e) {
            e.printStackTrace();
        } catch (LangDetectException e) {
            e.printStackTrace();
        } finally {
          IOUtils.closeQuietly(os);
        }
    }

    /**
     * Language detection test for each file (--detectlang option)
     *
     * <pre>
     * usage: --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]
     * </pre>
     *
     */
    public void detectLang() {
        if (loadProfile()) return;
        for (String filename: arglist) {
            BufferedReader is = null;
            try {
                is = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8"));

                Detector detector = DetectorFactory.create(getDouble("alpha", DEFAULT_ALPHA));
                if (hasOpt("--debug")) detector.setVerbose();
                detector.append(is);
                System.out.println(filename + ":" + detector.getProbabilities());
            } catch (IOException e) {
                e.printStackTrace();
            } catch (LangDetectException e) {
                e.printStackTrace();
            } finally {
                IOUtils.closeQuietly(is);
            }

        }
    }

    /**
     * Batch Test of Language Detection (--batchtest option)
     *
     * <pre>
     * usage: --batchtest -d [profile directory] -a [alpha] -s [seed] [test data(s)]
     * </pre>
     *
     * The format of test data(s):
     * <pre>
     *   [correct language name]\t[text body for test]\n
     * </pre>
    
     */
    public void batchTest() {
        if (loadProfile()) return;
        HashMap<String, ArrayList<String>> result = new HashMap<String, ArrayList<String>>();
        for (String filename: arglist) {
            BufferedReader is = null;
            try {
                is = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8"));
                while (is.ready()) {
                    String line = is.readLine();
                    int idx = line.indexOf('\t');
                    if (idx <= 0) continue;
                    String correctLang = line.substring(0, idx);
                    String text = line.substring(idx + 1);
                   
                    Detector detector = DetectorFactory.create(getDouble("alpha", DEFAULT_ALPHA));
                    detector.append(text);
/*
                    for(int j=0;j<text.length();++j) {
                        detector.append(text.charAt(j));
                        if (detector.isConvergence()) break;
                    }
*/
                    String lang = detector.detect();
                    if (!result.containsKey(correctLang)) result.put(correctLang, new ArrayList<String>());
                    result.get(correctLang).add(lang);
                    if (hasOpt("--debug")) System.out.println(correctLang + "," + lang + "," + (text.length()>100?text.substring(0, 100):text));
                }
               
            } catch (IOException e) {
                e.printStackTrace();
            } catch (LangDetectException e) {
                e.printStackTrace();
            } finally {
                IOUtils.closeQuietly(is);
            }

            ArrayList<String> langlist = new ArrayList<String>(result.keySet());
            Collections.sort(langlist);

            int totalCount = 0, totalCorrect = 0;
            for ( String lang :langlist) {
                HashMap<String, Integer> resultCount = new HashMap<String, Integer>();
                int count = 0;
                ArrayList<String> list = result.get(lang);
                for (String detectedLang: list) {
                    ++count;
                    if (resultCount.containsKey(detectedLang)) {
                        resultCount.put(detectedLang, resultCount.get(detectedLang) + 1);
                    } else {
                        resultCount.put(detectedLang, 1);
                    }
                }
                int correct = resultCount.containsKey(lang)?resultCount.get(lang):0;
                double rate = correct / (double)count;
                System.out.println(String.format("%s (%d/%d=%.2f): %s", lang, correct, count, rate, resultCount));
                totalCorrect += correct;
                totalCount += count;
            }
            System.out.println(String.format("total: %d/%d = %.3f", totalCorrect, totalCount, totalCorrect / (double)totalCount));
           
        }
       
    }

    /**
     * Command Line Interface
     * @param args command line arguments
     */
    public static void main(String[] args) {
        Command command = new Command();
        command.addOpt("-d", "directory", "./");
        command.addOpt("-a", "alpha", "" + DEFAULT_ALPHA);
        command.addOpt("-s", "seed", null);
        command.parse(args);

        if (command.hasOpt("--genprofile")) {
            command.generateProfile();
        } else if (command.hasOpt("--detectlang")) {
            command.detectLang();
        } else if (command.hasOpt("--batchtest")) {
            command.batchTest();
        }
    }
}
TOP

Related Classes of com.cybozu.labs.langdetect.Command

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.