Package org.apache.lucene.analysis.ja.dict

Source Code of org.apache.lucene.analysis.ja.dict.TestTokenInfoDictionary

package org.apache.lucene.analysis.ja.dict;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.lucene.analysis.ja.util.ToStringUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;

public class TestTokenInfoDictionary extends LuceneTestCase {

  /** enumerates the entire FST/lookup data and just does basic sanity checks */
  public void testEnumerateAll() throws Exception {
    // just for debugging
    int numTerms = 0;
    int numWords = 0;
    int lastWordId = -1;
    int lastSourceId = -1;
    TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
    ConnectionCosts matrix = ConnectionCosts.getInstance();
    FST<Long> fst = tid.getFST().getInternalFST();
    IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
    InputOutput<Long> mapping;
    IntsRef scratch = new IntsRef();
    while ((mapping = fstEnum.next()) != null) {
      numTerms++;
      IntsRef input = mapping.input;
      char chars[] = new char[input.length];
      for (int i = 0; i < chars.length; i++) {
        chars[i] = (char)input.ints[input.offset+i];
      }
      assertTrue(UnicodeUtil.validUTF16String(new String(chars)));
     
      Long output = mapping.output;
      int sourceId = output.intValue();
      // we walk in order, terms, sourceIds, and wordIds should always be increasing
      assertTrue(sourceId > lastSourceId);
      lastSourceId = sourceId;
      tid.lookupWordIds(sourceId, scratch);
      for (int i = 0; i < scratch.length; i++) {
        numWords++;
        int wordId = scratch.ints[scratch.offset+i];
        assertTrue(wordId > lastWordId);
        lastWordId = wordId;
        
        String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
        assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
       
        String inflectionForm = tid.getInflectionForm(wordId);
        assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
        if (inflectionForm != null) {
          // check that its actually an ipadic inflection form
          assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));         
        }
       
        String inflectionType = tid.getInflectionType(wordId);
        assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
        if (inflectionType != null) {
          // check that its actually an ipadic inflection type
          assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
        }
       
        int leftId = tid.getLeftId(wordId);
        int rightId = tid.getRightId(wordId);
       
        matrix.get(rightId, leftId);
       
        tid.getWordCost(wordId);
       
        String pos = tid.getPartOfSpeech(wordId);
        assertNotNull(pos);
        assertTrue(UnicodeUtil.validUTF16String(pos));
        // check that its actually an ipadic pos tag
        assertNotNull(ToStringUtil.getPOSTranslation(pos));
       
        String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
        assertNotNull(pronunciation);
        assertTrue(UnicodeUtil.validUTF16String(pronunciation));
       
        String reading = tid.getReading(wordId, chars, 0, chars.length);
        assertNotNull(reading);
        assertTrue(UnicodeUtil.validUTF16String(reading));
      }
    }
    if (VERBOSE) {
      System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
    }
  }
}
TOP

Related Classes of org.apache.lucene.analysis.ja.dict.TestTokenInfoDictionary

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.