/**
* Copyright 2010-2013 Atilika Inc. and contributors (see CONTRIBUTORS.md)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. A copy of the
* License is distributed with this work in the LICENSE.md file. You may
* also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.atilika.kuromoji.util;

import com.atilika.kuromoji.dict.TokenInfoDictionary;
import com.atilika.kuromoji.util.DictionaryBuilder.DictionaryFormat;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
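
/**
 * Builds a {@link TokenInfoDictionary} from dictionary source CSV files in a
 * supported dictionary format (IPADIC, UniDic or the Korean dictionary),
 * optionally NFKC-normalizing entries and skipping entries that match a
 * regular expression filter.
 * <p>
 * A minimal usage sketch; the directory path below is hypothetical and the
 * encoding shown assumes an EUC-JP encoded IPADIC source:
 * <pre>
 * TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder(
 *     DictionaryFormat.IPADIC, "euc-jp", true, false, null);
 * TokenInfoDictionary dictionary = builder.build("/path/to/dictionary/csvs");
 * </pre>
 */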
public class TokenInfoDictionaryBuilder {

    // Maps the internal word id (buffer offset) of each entry to its surface form
    private TreeMap<Integer, String> dictionaryEntries = new TreeMap<Integer, String>();

    private String encoding;
    private boolean normalizeEntries;
    private boolean addUnnormalizedEntries;
    private Pattern dictionaryFilter;
    private Formatter formatter;
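
    /**
     * Creates a builder for the given dictionary format.
     *
     * @param format  dictionary format that selects the entry formatter
     * @param encoding  character encoding of the source CSV files
     * @param normalizeEntries  whether to NFKC-normalize entries before adding them
     * @param addUnnormalizedEntries  whether to also add the unnormalized form of normalized entries
     * @param dictionaryFilter  regular expression for entries to skip, or null/empty for no filtering
     */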
    public TokenInfoDictionaryBuilder(DictionaryFormat format,
                                      String encoding,
                                      boolean normalizeEntries,
                                      boolean addUnnormalizedEntries,
                                      String dictionaryFilter) {
        if (format == DictionaryFormat.UNIDIC) {
            this.formatter = new UnidicFormatter();
        } else if (format == DictionaryFormat.KOREAN) {
            this.formatter = new KoreanFormatter();
        } else {
            this.formatter = new IpadicFormatter();
        }
        this.encoding = encoding;
        this.normalizeEntries = normalizeEntries;
        this.addUnnormalizedEntries = addUnnormalizedEntries;
        if (dictionaryFilter != null && !dictionaryFilter.isEmpty()) {
            this.dictionaryFilter = Pattern.compile(dictionaryFilter);
        }
    }
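
    /**
     * Builds the dictionary from all CSV files found in the given directory.
     *
     * @param dirname  directory containing the dictionary source CSV files
     * @return the built dictionary
     * @throws IOException if the directory cannot be listed or a CSV file cannot be read
     */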
    public TokenInfoDictionary build(String dirname) throws IOException {
        FilenameFilter filter = new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".csv");
            }
        };
        File[] files = new File(dirname).listFiles(filter);
        if (files == null) {
            throw new IOException("Could not list CSV files in directory: " + dirname);
        }
        ArrayList<File> csvFiles = new ArrayList<File>();
        for (File file : files) {
            csvFiles.add(file);
        }
        return buildDictionary(csvFiles);
    }
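
    /**
     * Builds the dictionary from the given CSV files, normalizing and
     * filtering entries as configured.
     */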
    public TokenInfoDictionary buildDictionary(List<File> csvFiles) throws IOException {
        TokenInfoDictionary dictionary = new TokenInfoDictionary(10 * 1024 * 1024); // Start with a 10MB buffer (can grow)
        int offset = 0; // Internal word id, assigned as entries are added; equals the entry's byte offset in the dictionary buffer
        for (File file : csvFiles) {
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (isSkipEntry(line)) {
                        System.out.println("Skipping line: " + line);
                        continue;
                    }
                    String[] entry = CSVUtil.parse(line);
                    if (normalizeEntries) {
                        String[] normalizedEntry = normalizeEntry(entry);
                        String normalizedLineEntry = CSVUtil.unparse(normalizedEntry);
                        if (isSkipEntry(normalizedLineEntry)) {
                            System.out.println("Skipping line because its normalized form matches the filter: " + line);
                            continue;
                        }
                        offset = addEntry(normalizedEntry, dictionary, dictionaryEntries, offset);
                        if (!isNormalized(entry[0]) && addUnnormalizedEntries) {
                            offset = addEntry(entry, dictionary, dictionaryEntries, offset);
                        }
                    } else {
                        offset = addEntry(entry, dictionary, dictionaryEntries, offset);
                    }
                }
            } finally {
                reader.close();
            }
        }
        return dictionary;
    }
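
    /**
     * Returns true if the entry line matches the configured dictionary filter
     * and should be skipped.
     */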
    private boolean isSkipEntry(String lineEntry) {
        if (dictionaryFilter == null) {
            return false;
        }
        return dictionaryFilter.matcher(lineEntry).find();
    }
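
    /**
     * Records the entry's surface form under the current offset, writes the
     * formatted entry to the dictionary, and returns the offset for the next entry.
     */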
    private int addEntry(String[] entry, TokenInfoDictionary dictionary, TreeMap<Integer, String> entries, int offset) {
        entries.put(offset, entry[0]);
        return dictionary.put(formatter.formatEntry(entry));
    }
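
    /**
     * Returns a copy of the entry with every field NFKC-normalized.
     */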
    private String[] normalizeEntry(String[] entry) {
        String[] normalizedEntry = new String[entry.length];
        for (int i = 0; i < entry.length; i++) {
            normalizedEntry[i] = normalize(entry[i]);
        }
        return normalizedEntry;
    }
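
    /**
     * Returns true if the input is already in NFKC normal form.
     */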
    private boolean isNormalized(String input) {
        return input.equals(normalize(input));
    }
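
    /**
     * Normalizes the input to NFKC normal form.
     */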
    private String normalize(String input) {
        return Normalizer.normalize(input, Normalizer.Form.NFKC);
    }
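
    /**
     * Returns the recorded (word id, surface form) pairs, ordered by word id.
     */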
    public Set<Entry<Integer, String>> entrySet() {
        return dictionaryEntries.entrySet();
    }
}