Package com.atilika.kuromoji.dict

Examples of com.atilika.kuromoji.dict.TokenInfoDictionary


        this.addUnnormalizedEntries = addUnnormalizedEntries;
        if (dictionaryFilter != null && !dictionaryFilter.isEmpty()) {
            this.dictionaryFilter = Pattern.compile(dictionaryFilter);
        }

        dictionary = new TokenInfoDictionary(10 * 1024 * 1024);

    }
View Full Code Here


        }
        return buildDictionary(csvFiles);
    }

    public TokenInfoDictionary buildDictionary(List<File> csvFiles) throws IOException {
        TokenInfoDictionary dictionary = new TokenInfoDictionary(10 * 1024 * 1024); // Start with 10MB buffer (can grow)
        int offset = 0; // Internal word id - incrementally assigned as entries are read and added (byte offset in the dictionary file)

        for (File file : csvFiles) {
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding));
            String line;
View Full Code Here

  }

    private void buildTokenInfoDictionary(DictionaryFormat format, String inputDirname, String outputDirname, String encoding, boolean normalizeEntries, boolean addUnnormalizedEntries, boolean compactTrie, String dictionaryFilter) throws IOException {
        System.out.println("building tokeninfo dict...");
        TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntries, addUnnormalizedEntries, dictionaryFilter);
        TokenInfoDictionary tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);

        System.out.print("  building double array trie...");
        DoubleArrayTrie trie = DoubleArrayTrieBuilder.build(tokenInfoBuilder.entrySet(), compactTrie);
        trie.write(outputDirname);
        System.out.println("  done");

        System.out.print("  processing target map...");
        for (Entry<Integer, String> entry : tokenInfoBuilder.entrySet()) {
            int tokenInfoId = entry.getKey();
            String surfaceForm = entry.getValue();
            int doubleArrayId = trie.lookup(surfaceForm);
            assert doubleArrayId > 0;
            tokenInfoDictionary.addMapping(doubleArrayId, tokenInfoId);
        }
        tokenInfoDictionary.write(outputDirname);

        System.out.println("done");
        System.out.println("done");
    }
View Full Code Here

TOP

Related Classes of com.atilika.kuromoji.dict.TokenInfoDictionary

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.