Package org.apache.lucene.analysis.ja.util

Source Code of org.apache.lucene.analysis.ja.util.UnknownDictionaryBuilder

package org.apache.lucene.analysis.ja.util;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.analysis.ja.dict.CharacterDefinition;

public class UnknownDictionaryBuilder {
  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
 
  private String encoding = "euc-jp";
 
  public UnknownDictionaryBuilder(String encoding) {
    this.encoding = encoding;
  }
 
  public UnknownDictionaryWriter build(String dirname) throws IOException {
    UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def")//Should be only one file
    readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
    return unkDictionary;
  }
 
  public UnknownDictionaryWriter readDictionaryFile(String filename)
      throws IOException {
    return readDictionaryFile(filename, encoding);
  }
 
  public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding)
      throws IOException {
    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
   
    FileInputStream inputStream = new FileInputStream(filename);
    Charset cs = Charset.forName(encoding);
    CharsetDecoder decoder = cs.newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
    LineNumberReader lineReader = new LineNumberReader(streamReader);
   
    dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
   
    List<String[]> lines = new ArrayList<String[]>();
    String line = null;
    while ((line = lineReader.readLine()) != null) {
      // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
      // even though the unknown dictionary returns hardcoded null here.
      final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
      lines.add(parsed);
    }
   
    Collections.sort(lines, new Comparator<String[]>() {
      public int compare(String[] left, String[] right) {
        int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
        int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
        return leftId - rightId;
      }
    });
   
    for (String[] entry : lines) {
      dictionary.put(entry);
    }
   
    return dictionary;
  }
 
  public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException {
    FileInputStream inputStream = new FileInputStream(filename);
    InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
    LineNumberReader lineReader = new LineNumberReader(streamReader);
   
    String line = null;
   
    while ((line = lineReader.readLine()) != null) {
      line = line.replaceAll("^\\s", "");
      line = line.replaceAll("\\s*#.*", "");
      line = line.replaceAll("\\s+", " ");
     
      // Skip empty line or comment line
      if(line.length() == 0) {
        continue;
      }
     
      if(line.startsWith("0x")) {  // Category mapping
        String[] values = line.split(" ", 2)// Split only first space
       
        if(!values[0].contains("..")) {
          int cp = Integer.decode(values[0]).intValue();
          dictionary.putCharacterCategory(cp, values[1]);
        } else {
          String[] codePoints = values[0].split("\\.\\.");
          int cpFrom = Integer.decode(codePoints[0]).intValue();
          int cpTo = Integer.decode(codePoints[1]).intValue();
         
          for(int i = cpFrom; i <= cpTo; i++){
            dictionary.putCharacterCategory(i, values[1]);
          }
        }
      } else // Invoke definition
        String[] values = line.split(" "); // Consecutive space is merged above
        String characterClassName = values[0];
        int invoke = Integer.parseInt(values[1]);
        int group = Integer.parseInt(values[2]);
        int length = Integer.parseInt(values[3]);
        dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
      }
    }
  }
}
TOP

Related Classes of org.apache.lucene.analysis.ja.util.UnknownDictionaryBuilder

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.