// Source Code of com.atilika.kuromoji.dict.TokenInfoDictionary

/**
 * Copyright © 2010-2013 Atilika Inc. and contributors (CONTRIBUTORS.txt)
 *
 * Atilika Inc. licenses this file to you under the Apache License, Version
 * 2.0 (the "License"); you may not use this file except in compliance with
 * the License.  A copy of the License is distributed with this work in the
 * LICENSE.txt file.  You may also obtain a copy of the License from
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package com.atilika.kuromoji.dict;


import com.atilika.kuromoji.ClassLoaderResolver;
import com.atilika.kuromoji.ResourceResolver;
import com.atilika.kuromoji.util.CSVUtil;


import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


public class TokenInfoDictionary implements Dictionary {


  public static final String FILENAME = "tid.dat";


  public static final String TARGETMAP_FILENAME = "tid_map.dat";


    public static final String PART_OF_SPEECH_FILENAME = "tid_pos.dat";


    public static final int POS_OFFSET = 6;


    public static final int SIZE_OFFSET = POS_OFFSET + 2;


    public static final int FEATURE_OFFSET = SIZE_OFFSET + 2;


  protected ByteBuffer buffer;


  protected int[][] targetMap;


    protected Map<String, Short> pos;


    protected List<String> posList;
    
  public TokenInfoDictionary() {
        pos = new HashMap<String, Short>();
        posList = new ArrayList<String>();
        targetMap = new int[1][];
  }


  public TokenInfoDictionary(int size) {
        this();
    buffer = ByteBuffer.allocate(size);
  }


  /**
   * put the entry in map
   * @param entry
   * @return current position of buffer, which will be wordId of next entry
   */
  public int put(String[] entry) {
        int posStart = 4;
        // Ugly hack for Jumandic, smaller features, only last field.
        int featureStart = entry.length > 11 ? 10 : 7;//entry.length - 3;


        featureStart = 10;


        short leftId = Short.parseShort(entry[1]);
    short rightId = Short.parseShort(entry[2]);
    short wordCost = Short.parseShort(entry[3]);


        String posFeatures = extractPosFeatures(entry, posStart, featureStart);
        short partOfSpeechId = createPartOfSpeech(posFeatures);
        String features = extractFeatures(entry, featureStart, entry.length);
        int featuresSize = features.length()* 2;
        int otherFieldSize = 2 * 5; // Buffer space needed by leftId, rightId, wordCost, partOfSpeechId and featuresSize


        extendBufferIfNecessary(featuresSize + otherFieldSize);


        buffer.putShort(leftId);
        buffer.putShort(rightId);
        buffer.putShort(wordCost);


        buffer.putShort(partOfSpeechId);


    buffer.putShort((short)featuresSize);


        for (char c : features.toCharArray()){
      buffer.putChar(c);
    }


    return buffer.position();
  }


    private String extractFeatures(String[] entry, int start, int end) {
        StringBuilder sb = new StringBuilder();


        int readingIndex = start + 1;
        String baseForm = (end > start) ? entry[start] : null;
        String reading = (end > readingIndex) ? entry[readingIndex] : null;


        for (int i = start; i < end; i++) {
            if (entry[i].equals(baseForm) && i > readingIndex) {
                sb.append(REPEATED_BASEFORM);
            } else if (entry[i].equals(reading) && i > readingIndex) {
                sb.append(REPEATED_TERM);
            } else {
                sb.append(entry[i]);
            }


            if (i < end - 1) {
                sb.append(INTERNAL_SEPARATOR);
            }
        }


        return sb.toString();
    }


    private String extractPosFeatures(String[] entry, int start, int end) {
        StringBuilder sb = new StringBuilder();
        for (int i = start; i < end; i++) {
            sb.append(entry[i]);


            if (i < end - 1) {
                sb.append(INTERNAL_SEPARATOR);
            }
        }
    return sb.toString();
    }


    private void extendBufferIfNecessary(int neededSize) {
        int leftInBuffer = buffer.limit() - buffer.position();


        if (neededSize > leftInBuffer) { // four short and features
            ByteBuffer newBuffer = ByteBuffer.allocate(buffer.limit() * 2);
            buffer.flip();
            newBuffer.put(buffer);
            buffer = newBuffer;
        }
    }


    protected short createPartOfSpeech(String features) {
        Short posId = pos.get(features);
        
        if (posId == null) {
            posId = (short) pos.size();
            pos.put(features, posId);
            posList.add(posId, features);
        }
        return posId;
    }


    public void addMapping(int sourceId, int wordId) {
    if(targetMap.length <= sourceId) {
      int[][] newArray = new int[sourceId + 1][];
      System.arraycopy(targetMap, 0, newArray, 0, targetMap.length);
      targetMap = newArray;
    }


    // Prepare array -- extend the length of array by one
    int[] current = targetMap[sourceId];
    if (current == null) {
      current = new int[1];
    } else {
      int[] newArray = new int[current.length + 1];
      System.arraycopy(current, 0, newArray, 0, current.length);
      current = newArray;
    }
    targetMap[sourceId] = current;


    int[] targets = targetMap[sourceId];
    targets[targets.length - 1] = wordId;
  }


  public int[] lookupWordIds(int sourceId) {
    return targetMap[sourceId];
  }


  @Override
  public int getLeftId(int wordId) {
    return buffer.getShort(wordId);
  }


  @Override
  public int getRightId(int wordId) {
    return buffer.getShort(wordId + 2);  // Skip left id
  }


  @Override
  public int getWordCost(int wordId) {
    return buffer.getShort(wordId + 4);  // Skip left id and right id
  }




    @Override
    public String[] getAllFeaturesArray(int wordId) {
        List<String> features = new ArrayList<String>(16);


        attachPosInfo(wordId, features);
        attachFeatures(wordId, features);


        return features.toArray(new String[features.size()]);
    }


    private void attachFeatures(int wordId, List<String> features) {
        int size = buffer.getShort(wordId + SIZE_OFFSET) / 2; // Read length of feature String. Skip 6 bytes, see data structure.
        int offset = wordId + FEATURE_OFFSET;
        char[] charBuffer = new char[size];
        int position = 0;


        String reading = null;
        String baseForm = null;
        String feature = null;


        for (int i = 0; i < size; i++) {
            char c = buffer.getChar(offset + i * 2);
            if (c == INTERNAL_SEPARATOR) {
                feature = new String(charBuffer, 0, position);
                if (features.size() == 6) {
                    baseForm = feature;
                } else if (features.size() == 7) {
                    reading = feature;
                }
                if (features.size() > 6) {
                    if (charBuffer[0] == REPEATED_TERM) {
                        feature = reading;
                    } else if (charBuffer[0] == REPEATED_BASEFORM) {
                        feature = baseForm;
                    }
                }
                features.add(feature);
                position = 0;
            } else {
                charBuffer[position++] = c;
            }
        }


        if (position > 0) {
            feature = new String(charBuffer, 0, position);
            if (features.size() > 7) {
                if (charBuffer[0] == REPEATED_TERM) {
                    feature = reading;
                } else if (charBuffer[0] == REPEATED_BASEFORM) {
                    feature = baseForm;
                }
            }
            features.add(feature);
        }
    }


    private void attachPosInfo(int wordId, List<String> features) {
        int posDetail = buffer.getShort(wordId + POS_OFFSET);
        String posInfo = posList.get(posDetail);


        int size = posInfo.length();
        char[] charBuffer = new char[size];
        int position = 0;


        for (int i = 0; i < size; i++){
            char c = posInfo.charAt(i);
            if (c == INTERNAL_SEPARATOR) {
                features.add(new String(charBuffer, 0, position));
                position = 0;
            } else {
                charBuffer[position++] = c;
            }
        }


        if (position > 0) {
            features.add(new String(charBuffer, 0, position));
        }
    }


    @Override
  public String getFeature(int wordId, int... fields) {
    String[] allFeatures = getAllFeaturesArray(wordId);
    StringBuilder sb = new StringBuilder();


    if(fields.length == 0){ // All features
      for(String feature : allFeatures) {
        sb.append(CSVUtil.quoteEscape(feature)).append(",");
      }
    } else if(fields.length == 1) { // One feature doesn't need to escape value
      sb.append(allFeatures[fields[0]]).append(",");
    } else {
      for(int field : fields){
        sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
      }
    }


    return sb.deleteCharAt(sb.length() - 1).toString();
  }


  @Override
  public String getReading(int wordId) {
    return getFeature(wordId, 7);
  }


  @Override
  public String getAllFeatures(int wordId) {
    return getFeature(wordId);
  }


  @Override
  public String getPartOfSpeech(int wordId) {
    return getFeature(wordId, 0, 1, 2, 3);
  }


  @Override
  public String getBaseForm(int wordId) {
    return getFeature(wordId, 6);
  }


  /**
   * Write dictionary in file
   * Dictionary format is:
   * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
   * @param directoryName
   * @throws IOException
   */
  public void write(String directoryName) throws IOException {
    writeDictionary(directoryName + File.separator + FILENAME);
    writeTargetMap(directoryName + File.separator + TARGETMAP_FILENAME);
        writePosVector(directoryName + File.separator + PART_OF_SPEECH_FILENAME);
  }


    protected void writeDictionary(String filename) throws IOException {
    FileOutputStream fos = new FileOutputStream(filename);
    DataOutputStream dos = new DataOutputStream(fos);
    dos.writeInt(buffer.position());
    WritableByteChannel channel = Channels.newChannel(fos);
    // Write Buffer
    buffer.flip();  // set position to 0, set limit to current position
    channel.write(buffer);
    fos.close();
  }


  /**
   * Read dictionary into directly allocated buffer.
   * @return TokenInfoDictionary instance
   * @throws IOException
   * @throws ClassNotFoundException
   */
  public static TokenInfoDictionary newInstance(ResourceResolver resolver) throws IOException, ClassNotFoundException {
    TokenInfoDictionary dictionary = new TokenInfoDictionary();
    dictionary.loadDictionary(resolver.resolve(FILENAME));
    dictionary.loadTargetMap(resolver.resolve(TARGETMAP_FILENAME));
        dictionary.loadPosVector(resolver.resolve(PART_OF_SPEECH_FILENAME));
    return dictionary;
  }


    public static TokenInfoDictionary newInstance() throws IOException, ClassNotFoundException {
        return newInstance(new ClassLoaderResolver(TokenInfoDictionary.class));
    }


  protected void writeTargetMap(String filename) throws IOException {
    DataOutputStream daos = new DataOutputStream(new FileOutputStream(filename));
    daos.writeInt(targetMap.length);
    // The array is mostly sparse so we'll save only non-null members.
    for (int i = 0; i < targetMap.length; i++) {
      if (targetMap[i] != null) {
        int[] arr = targetMap[i];
        daos.writeInt(i);
        daos.writeInt(arr.length);
        for (int j : arr) daos.writeInt(j);
      }
    }
    daos.writeInt(-1); // End index marker.
    daos.close();
  }


    protected void writePosVector(String filename) throws IOException {
        Writer writer = new OutputStreamWriter(new FileOutputStream(filename), "UTF-8");
        for (String s : posList) {
            writer.write(s);
            writer.write('\n');
        }
        writer.close();
    }


  protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
    DataInputStream dais = new DataInputStream(new BufferedInputStream(is));
    targetMap = new int [dais.readInt()][];
    int index;
    while ((index = dais.readInt()) >= 0) {
      int length = dais.readInt();
      targetMap[index] = new int[length];
      for (int j = 0; j < length; j++) {
        targetMap[index][j] = dais.readInt();
      }
    }
  }


  protected void loadDictionary(InputStream is) throws IOException {
        BufferedInputStream bis = new BufferedInputStream(is);
    DataInputStream dis = new DataInputStream(bis);
    int size = dis.readInt();


    ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);


    ReadableByteChannel channel = Channels.newChannel(bis);
    channel.read(tmpBuffer);
    dis.close();
    buffer = tmpBuffer.asReadOnlyBuffer();
  }


    protected void loadPosVector(InputStream is) throws IOException {
        InputStreamReader isr = new InputStreamReader(new BufferedInputStream(is), "UTF-8");
        LineNumberReader reader = new LineNumberReader(isr);
        String line;
        List<String> partOfSpeech = new ArrayList<String>();
        while ((line = reader.readLine()) != null) {
            partOfSpeech.add(line);
        }
        posList = partOfSpeech;
        isr.close();
    }
}
// Source Code of com.atilika.kuromoji.dict.TokenInfoDictionary

// Related Classes of com.atilika.kuromoji.dict.TokenInfoDictionary