/*
 * Copyright © 2010-2013 Atilika Inc. and contributors (CONTRIBUTORS.txt)
 * Atilika Inc. licenses this file to you under the Apache License, Version
 * 2.0 (the "License"); you may not use this file except in compliance with
 * the License. A copy of the License is distributed with this work in the
 * LICENSE.txt file. You may also obtain a copy of the License from
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package com.atilika.kuromoji.dict;
import com.atilika.kuromoji.ClassLoaderResolver;
import com.atilika.kuromoji.ResourceResolver;
import com.atilika.kuromoji.util.CSVUtil;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Dictionary of token entries kept in a flat {@link ByteBuffer}, together with
 * a source-id to word-id mapping and a table of part-of-speech strings.
 * A word id is the byte offset of an entry inside the buffer.
 */
public class TokenInfoDictionary implements Dictionary {
// File name used by write() for the entry buffer.
public static final String FILENAME = "tid.dat";
// File name used by write() for the source-id to word-id mapping.
public static final String TARGETMAP_FILENAME = "tid_map.dat";
// File name used by write() for the part-of-speech table.
public static final String PART_OF_SPEECH_FILENAME = "tid_pos.dat";
// Byte offset, relative to a word id, of the short holding the part-of-speech id.
// The six bytes before it are the left id (+0), right id (+2) and word cost (+4).
public static final int POS_OFFSET = 6;
// Byte offset, relative to a word id, of the short holding the feature length in bytes.
public static final int SIZE_OFFSET = POS_OFFSET + 2;
// Byte offset, relative to a word id, of the first feature char.
public static final int FEATURE_OFFSET = SIZE_OFFSET + 2;
// Entry storage; the accessors read it with absolute offsets derived from the word id.
protected ByteBuffer buffer;
// targetMap[sourceId] holds the word ids registered for that source id, or null if none.
protected int[][] targetMap;
// Part-of-speech string -> id; the id is the string's index in posList.
protected Map<String, Short> pos;
// Part-of-speech strings indexed by id.
protected List<String> posList;
/**
 * Creates a dictionary with empty part-of-speech tables and a one-slot
 * target map. The entry buffer is left unallocated.
 */
public TokenInfoDictionary() {
    pos = new HashMap<String, Short>();
    posList = new ArrayList<String>();
    targetMap = new int[1][];
}
/**
 * Creates a dictionary whose entry buffer initially holds {@code size} bytes.
 *
 * @param size initial capacity of the entry buffer, in bytes
 */
public TokenInfoDictionary(int size) {
    // Delegate first so pos, posList and targetMap are initialised;
    // put() and addMapping() dereference them.
    this();
    buffer = ByteBuffer.allocate(size);
}
* put the entry in map
* @param entry
* @return current position of buffer, which will be wordId of next entry
public int put(String[] entry) {
int posStart = 4;
// Ugly hack for Jumandic, smaller features, only last field.
int featureStart = entry.length > 11 ? 10 : 7;//entry.length - 3;
featureStart = 10;
short leftId = Short.parseShort(entry[1]);
short rightId = Short.parseShort(entry[2]);
short wordCost = Short.parseShort(entry[3]);
String posFeatures = extractPosFeatures(entry, posStart, featureStart);
short partOfSpeechId = createPartOfSpeech(posFeatures);
String features = extractFeatures(entry, featureStart, entry.length);
int featuresSize = features.length()* 2;
int otherFieldSize = 2 * 5; // Buffer space needed by leftId, rightId, wordCost, partOfSpeechId and featuresSize
extendBufferIfNecessary(featuresSize + otherFieldSize);
for (char c : features.toCharArray()){
return buffer.position();
/**
 * Builds the feature string for the columns {@code entry[start .. end-1]}.
 *
 * NOTE(review): the bodies of the three branches and of the
 * {@code i < end - 1} check are not present in this view, and no closing
 * braces are visible; as shown, nothing is ever appended to {@code sb}.
 * Presumably each branch appends either a repeat marker or the column value,
 * followed by a separator between columns -- TODO confirm against the
 * intended implementation.
 *
 * @param entry columns of one dictionary line
 * @param start index of the first feature column (treated as the base form)
 * @param end   index one past the last feature column
 * @return the contents of the builder
 */
private String extractFeatures(String[] entry, int start, int end) {
StringBuilder sb = new StringBuilder();
// The column right after the base form is treated as the reading.
int readingIndex = start + 1;
// Both are null when the range is too short to contain them.
String baseForm = (end > start) ? entry[start] : null;
String reading = (end > readingIndex) ? entry[readingIndex] : null;
for (int i = start; i < end; i++) {
// Only columns after the reading column are compared against the
// base form / reading; the first two columns always take the else branch.
if (entry[i].equals(baseForm) && i > readingIndex) {
} else if (entry[i].equals(reading) && i > readingIndex) {
} else {
// True for every column except the last one in the range.
if (i < end - 1) {
return sb.toString();
/**
 * Builds the part-of-speech string for the columns {@code entry[start .. end-1]}.
 *
 * NOTE(review): the loop's append statements and all closing braces are not
 * present in this view; as shown, nothing is appended to {@code sb}.
 * Presumably each column is appended with a separator between columns
 * (the {@code i < end - 1} check skips the last one) -- TODO confirm.
 *
 * @param entry columns of one dictionary line
 * @param start index of the first part-of-speech column
 * @param end   index one past the last part-of-speech column
 * @return the contents of the builder
 */
private String extractPosFeatures(String[] entry, int start, int end) {
StringBuilder sb = new StringBuilder();
for (int i = start; i < end; i++) {
// True for every column except the last one in the range.
if (i < end - 1) {
return sb.toString();
private void extendBufferIfNecessary(int neededSize) {
int leftInBuffer = buffer.limit() - buffer.position();
if (neededSize > leftInBuffer) { // four short and features
ByteBuffer newBuffer = ByteBuffer.allocate(buffer.limit() * 2);
buffer = newBuffer;
protected short createPartOfSpeech(String features) {
Short posId = pos.get(features);
if (posId == null) {
posId = (short) pos.size();
pos.put(features, posId);
posList.add(posId, features);
return posId;
public void addMapping(int sourceId, int wordId) {
if(targetMap.length <= sourceId) {
int[][] newArray = new int[sourceId + 1][];
System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
targetMap = newArray;
// Prepare array -- extend the length of array by one
int[] current = targetMap[sourceId];
if (current == null) {
current = new int[1];
} else {
int[] newArray = new int[current.length + 1];
System.arraycopy(current, 0, newArray, 0, current.length);
current = newArray;
targetMap[sourceId] = current;
int[] targets = targetMap[sourceId];
targets[targets.length - 1] = wordId;
public int[] lookupWordIds(int sourceId) {
return targetMap[sourceId];
public int getLeftId(int wordId) {
return buffer.getShort(wordId);
public int getRightId(int wordId) {
return buffer.getShort(wordId + 2); // Skip left id
public int getWordCost(int wordId) {
return buffer.getShort(wordId + 4); // Skip left id and right id
public String[] getAllFeaturesArray(int wordId) {
List<String> features = new ArrayList<String>(16);
attachPosInfo(wordId, features);
attachFeatures(wordId, features);
return features.toArray(new String[features.size()]);
/**
 * Decodes the feature chars stored for an entry and resolves the
 * REPEATED_TERM / REPEATED_BASEFORM markers to the reading / base form.
 *
 * NOTE(review): the line that tests the current char (the {@code if} that
 * pairs with the {@code } else {} below), the statements that add
 * {@code feature} to {@code features}, and all closing braces are not
 * present in this view, so the exact nesting cannot be determined from here.
 * Presumably a separator char ends the current field -- TODO confirm.
 *
 * @param wordId   byte offset of the entry in the buffer
 * @param features list the decoded values are meant to be appended to; it is
 *                 expected to already hold the part-of-speech fields, since
 *                 list sizes 6 and 7 are used to identify base form and reading
 */
private void attachFeatures(int wordId, List<String> features) {
int size = buffer.getShort(wordId + SIZE_OFFSET) / 2; // Stored length is in bytes; halve it to get the number of chars.
int offset = wordId + FEATURE_OFFSET;
// Scratch space for the field currently being read; a field can never be longer than the whole string.
char[] charBuffer = new char[size];
int position = 0;
String reading = null;
String baseForm = null;
String feature = null;
for (int i = 0; i < size; i++) {
// Chars are two bytes each, read with absolute offsets.
char c = buffer.getChar(offset + i * 2);
feature = new String(charBuffer, 0, position);
// The field completed while the list holds 6 values is the base form,
// the one completed while it holds 7 is the reading.
if (features.size() == 6) {
baseForm = feature;
} else if (features.size() == 7) {
reading = feature;
if (features.size() > 6) {
// A field starting with a marker char stands for a previously seen value.
if (charBuffer[0] == REPEATED_TERM) {
feature = reading;
} else if (charBuffer[0] == REPEATED_BASEFORM) {
feature = baseForm;
// Start collecting the next field.
position = 0;
} else {
charBuffer[position++] = c;
// Handles the final field, which has chars collected but no terminator after it.
if (position > 0) {
feature = new String(charBuffer, 0, position);
if (features.size() > 7) {
if (charBuffer[0] == REPEATED_TERM) {
feature = reading;
} else if (charBuffer[0] == REPEATED_BASEFORM) {
feature = baseForm;
/**
 * Looks up the part-of-speech string of an entry and appends its fields
 * to {@code features}.
 *
 * NOTE(review): the line that tests the current char (the {@code if} that
 * pairs with the {@code } else {} below) and all closing braces are not
 * present in this view. Presumably a separator char ends the current
 * field -- TODO confirm.
 *
 * @param wordId   byte offset of the entry in the buffer
 * @param features list the part-of-speech fields are appended to
 */
private void attachPosInfo(int wordId, List<String> features) {
// The entry stores an index into posList, not the string itself.
int posDetail = buffer.getShort(wordId + POS_OFFSET);
String posInfo = posList.get(posDetail);
int size = posInfo.length();
// Scratch space for the field currently being read.
char[] charBuffer = new char[size];
int position = 0;
for (int i = 0; i < size; i++){
char c = posInfo.charAt(i);
features.add(new String(charBuffer, 0, position));
// Start collecting the next field.
position = 0;
} else {
charBuffer[position++] = c;
// Handles the final field, which has chars collected but no terminator after it.
if (position > 0) {
features.add(new String(charBuffer, 0, position));
/**
 * Returns selected features of an entry as a single string.
 *
 * NOTE(review): the loop bodies, the single-field branch body and all
 * closing braces are not present in this view; as shown, nothing is appended
 * to {@code sb}. Presumably each selected value is appended followed by a
 * one-char separator, which the final {@code deleteCharAt} removes --
 * TODO confirm.
 *
 * @param wordId byte offset of the entry in the buffer
 * @param fields indexes into the array returned by getAllFeaturesArray;
 *               no indexes selects every feature
 * @return the contents of the builder without its last char
 */
public String getFeature(int wordId, int... fields) {
String[] allFeatures = getAllFeaturesArray(wordId);
StringBuilder sb = new StringBuilder();
if(fields.length == 0){ // All features
for(String feature : allFeatures) {
} else if(fields.length == 1) { // One feature doesn't need to escape value
} else {
for(int field : fields){
// NOTE(review): deleteCharAt throws StringIndexOutOfBoundsException when sb is empty.
return sb.deleteCharAt(sb.length() - 1).toString();
public String getReading(int wordId) {
return getFeature(wordId, 7);
public String getAllFeatures(int wordId) {
return getFeature(wordId);
public String getPartOfSpeech(int wordId) {
return getFeature(wordId, 0, 1, 2, 3);
public String getBaseForm(int wordId) {
return getFeature(wordId, 6);
* Write dictionary in file
* Dictionary format is:
* [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
* @param directoryName
* @throws IOException
public void write(String directoryName) throws IOException {
writeDictionary(directoryName + File.separator + FILENAME);
writeTargetMap(directoryName + File.separator + TARGETMAP_FILENAME);
writePosVector(directoryName + File.separator + PART_OF_SPEECH_FILENAME);
protected void writeDictionary(String filename) throws IOException {
FileOutputStream fos = new FileOutputStream(filename);
DataOutputStream dos = new DataOutputStream(fos);
WritableByteChannel channel = Channels.newChannel(fos);
// Write Buffer
buffer.flip(); // set position to 0, set limit to current position
/**
 * Read dictionary into directly allocated buffer.
 *
 * @return TokenInfoDictionary instance
 * @throws IOException
 * @throws ClassNotFoundException
 */
// Creates a dictionary for the given resolver and returns it.
// NOTE(review): no statements are visible between construction and return in
// this view, so `resolver` is unused and the returned dictionary has no entry
// buffer. Presumably loadDictionary / loadTargetMap / loadPosVector should be
// called here with streams obtained from the resolver -- TODO confirm.
public static TokenInfoDictionary newInstance(ResourceResolver resolver) throws IOException, ClassNotFoundException {
TokenInfoDictionary dictionary = new TokenInfoDictionary();
return dictionary;
public static TokenInfoDictionary newInstance() throws IOException, ClassNotFoundException {
return newInstance(new ClassLoaderResolver(TokenInfoDictionary.class));
protected void writeTargetMap(String filename) throws IOException {
DataOutputStream daos = new DataOutputStream(new FileOutputStream(filename));
// The array is mostly sparse so we'll save only non-null members.
for (int i = 0; i < targetMap.length; i++) {
if (targetMap[i] != null) {
int[] arr = targetMap[i];
for (int j : arr) daos.writeInt(j);
daos.writeInt(-1); // End index marker.
protected void writePosVector(String filename) throws IOException {
Writer writer = new OutputStreamWriter(new FileOutputStream(filename), "UTF-8");
for (String s : posList) {
protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
DataInputStream dais = new DataInputStream(new BufferedInputStream(is));
targetMap = new int [dais.readInt()][];
int index;
while ((index = dais.readInt()) >= 0) {
int length = dais.readInt();
targetMap[index] = new int[length];
for (int j = 0; j < length; j++) {
targetMap[index][j] = dais.readInt();
protected void loadDictionary(InputStream is) throws IOException {
BufferedInputStream bis = new BufferedInputStream(is);
DataInputStream dis = new DataInputStream(bis);
int size = dis.readInt();
ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
ReadableByteChannel channel = Channels.newChannel(bis);
buffer = tmpBuffer.asReadOnlyBuffer();
// Reads the part-of-speech table from a UTF-8 stream line by line and
// replaces posList with the result.
// NOTE(review): the loop body and the closing braces are not present in this
// view, so as shown every line is read and dropped and posList ends up empty.
// Presumably each line should be added to partOfSpeech -- TODO confirm.
// NOTE(review): the `pos` map is not updated here, and no close of the reader
// is visible.
protected void loadPosVector(InputStream is) throws IOException {
InputStreamReader isr = new InputStreamReader(new BufferedInputStream(is), "UTF-8");
LineNumberReader reader = new LineNumberReader(isr);
String line;
List<String> partOfSpeech = new ArrayList<String>();
while ((line = reader.readLine()) != null) {
posList = partOfSpeech;