Package org.jasen.core.linguistics

Source Code of org.jasen.core.linguistics.LexicalTreeAnalyzer

/*
* Copyright (c) 2004, 2005  jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
*   1. Redistributions of source code must retain the above copyright notice,
*      this list of conditions and the following disclaimer.
*
*   2. Redistributions in binary form must reproduce the above copyright
*      notice, this list of conditions and the following disclaimer in
*      the documentation and/or other materials provided with the distribution.
*
*   3. The names of the authors may not be used to endorse or promote products
*      derived from this software without specific prior written permission.
*
*   4. Any modification or additions to the software must be contributed back
*      to the project.
*
*   5. Any investigation or reverse engineering of source code or binary to
*      enable emails to bypass the filters, and hence inflict spam and or viruses
*      onto users who use or do not use jASEN could subject the perpetrator to
*      criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.linguistics;

import it.unimi.dsi.fastutil.chars.Char2ObjectOpenHashMap;

import java.io.IOException;
import java.util.Arrays;

import org.jasen.core.token.SimpleWordTokenizer;

/**
* Employes a lexical tree approach to word recognition.
* <p>
* Based on a sample corpus, the analyser builds a tree of characters such that each characters in a word is a node in the tree.
* </p>
* <p>
* When a word with a similar character sequence is found, the path to the next character is strengthened
* </p>
* @author Jason Polites
*/
public class LexicalTreeAnalyzer {

  private String[] tokens;
  private Char2ObjectOpenHashMap forwardTree;
  private Char2ObjectOpenHashMap reverseTree;
  private static final String ENGLISH_DICTIONARY_PATH = "org/jasen/core/linguistics/dictionaries/english.dic";

  /**
   * Creates and initialized the analyzer
   */
  public void initialize() throws IOException {
    // Get the dictionary as a resource stream
    SimpleWordTokenizer t = new SimpleWordTokenizer(this.getClass().getClassLoader().getResourceAsStream(ENGLISH_DICTIONARY_PATH));
    t.tokenize();
    tokens = t.getTokens();
    Arrays.sort(tokens);
    buildTrees();
  }

  /**
   * Computes the probability that the given sequence of characters is an English word.
   * <p>
   * This works on the premise that most English words exhibit a similar set of character sequence patterns
   * in both their prefix, body and suffix.
   * </p>
   * The value of the word is determined by analysis if the characters in the word
   * against the values in both the forward and backward lexical trees.
   * <BR><BR>
   * The maximium possible value a word can have is 1 (100%), thus for each character in the word
   * which is correctly positioned in accordance with the rules in the tree, the computed value is
   * increased by 1/W where 'W' is the length of the word; such that if a word perfectly matches a
   * branch of the tree a result of 1/W x W (or 1) will be returned.
   * <BR><BR>
   * Where a word fails to match a forward branch perfectly, two things are done:
   * <OL>
   *   <LI>For each remaining character in the token, the current total is reduced by the same percentile fraction as used to calculate the total.</LI>
   *   <LI>The token is given a "second chance" by repeating the initial calculation process with the reverse tree.</LI>
   * </OL>
   * @param word The word to be tested
   * @return A value between 0.0 and 1.0 indicating the probability that the String is an English word.
   */
  public double computeWordValue(String word) {

    // First, check the dictionary
    word = word.toLowerCase();
    double result;

    if(Arrays.binarySearch(tokens, word) > -1) {
      result = 1.0d;
    }
    else
    {
      long time = System.currentTimeMillis();
      double percentile = (1.0d / (double)word.length());
      result = computeWordValue(forwardTree, word, true, 0, percentile, 0.0d);

      if(result < 0.0d) {
        result = 0.0d;
      }
    }
    return result;
  }


  /**
   * The value of the word is determined by analysis if the characters in the word
   * against the values in both the forward and backward lexical trees
   * <BR><BR>
   * The maximium possible value a word can have is 1 (100%), thus for each character in the word
   * which is correctly positioned in accordance with the rules in the tree, the computed value is
   * increased by 1/W where 'W' is the length of the word; such that if a word perfectly matches a
   * branch of the tree a result of 1/W x W (or 1) will be returned
   * <BR><BR>
   * Where a word fails to match a forward branch perfectly, two things are done:
   * <OL>
   *   <LI>For each remaining character in the token, the current total is reduced by the same percentile fraction as used to calculate the total</LI>
   *   <LI>The token is given a "second chance" by repeating the initial calculation process with the reverse tree</LI>
   * </OL>
   * @param node
   * @param word
   * @param index
   * @return
   */
  private double computeWordValue(Char2ObjectOpenHashMap node, String word, boolean forward, int index, double percentile, double total) {

    char chr = word.charAt(index);

    Char2ObjectOpenHashMap next = (Char2ObjectOpenHashMap)node.get(chr);

    if(next != null) {

      total += percentile;

      if(forward) {
        index++;

        if(index < word.length()) {
          total = computeWordValue(next, word, forward, index, percentile, total);
        }
      }
      else
      {
        index--;

        if(index >= 0) {
          total = computeWordValue(next, word, forward, index, percentile, total);
        }
      }
    }
    else if(index < word.length() - 1 && forward) {

      for (int i = index; i < word.length(); i++) {
        total -= percentile;
      }

      total = computeWordValue(reverseTree, word, false, (word.length() - 1), percentile, total);
    }

    return total;
  }

  /**
   * Loops through the list of tokens and builds a character tree
   *
   */
  private void buildTrees() {
    // First, create a root node
    forwardTree = new Char2ObjectOpenHashMap();
    reverseTree = new Char2ObjectOpenHashMap();

    String token = null;
    char chr;

    Char2ObjectOpenHashMap currentNode = null;
    Char2ObjectOpenHashMap targetNode = null;


    for (int i = 0; i < tokens.length; i++) {
      token = tokens[i];

      if(token != null) {

        // Build the forward tree
        currentNode = forwardTree;

        for (int j = 0; j < token.length(); j++) {
          chr = token.charAt(j);

          // Attempt to locate the node from the current node
          targetNode = (Char2ObjectOpenHashMap)currentNode.get(chr);

          if(targetNode == null) {
            targetNode = new Char2ObjectOpenHashMap(1);
            //targetNode.setCharacter(chr);
            currentNode.put(chr, targetNode);
          }
          currentNode = targetNode;
        }

        // Build the reverse tree
        currentNode = reverseTree;

        for (int j = token.length() - 1; j >= 0; j--) {
          chr = token.charAt(j);

          // Attempt to locate the node from the current node
          targetNode =(Char2ObjectOpenHashMap)currentNode.get(chr);

          if(targetNode == null) {
            targetNode = new Char2ObjectOpenHashMap(1);
            //targetNode.setCharacter(chr);
            currentNode.put(chr, targetNode);
          }

          currentNode = targetNode;
        }
      }
    }
  }
}
TOP

Related Classes of org.jasen.core.linguistics.LexicalTreeAnalyzer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.