// Source Code of org.jasen.core.linguistics.LexicalTreeAnalyzer

/*
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.linguistics;


import it.unimi.dsi.fastutil.chars.Char2ObjectOpenHashMap;


import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Locale;


import org.jasen.core.token.SimpleWordTokenizer;


/**
 * Employes a lexical tree approach to word recognition.
 * <p>
 * Based on a sample corpus, the analyser builds a tree of characters such that each characters in a word is a node in the tree.
 * </p>
 * <p>
 * When a word with a similar character sequence is found, the path to the next character is strengthened
 * </p>
 * @author Jason Polites
 */
public class LexicalTreeAnalyzer {


  private String[] tokens;
  private Char2ObjectOpenHashMap forwardTree;
  private Char2ObjectOpenHashMap reverseTree;
  private static final String ENGLISH_DICTIONARY_PATH = "org/jasen/core/linguistics/dictionaries/english.dic";


  /**
   * Creates and initialized the analyzer
   */
  public void initialize() throws IOException {
    // Get the dictionary as a resource stream
    SimpleWordTokenizer t = new SimpleWordTokenizer(this.getClass().getClassLoader().getResourceAsStream(ENGLISH_DICTIONARY_PATH));
    t.tokenize();
    tokens = t.getTokens();
    Arrays.sort(tokens);
    buildTrees();
  }


  /**
   * Computes the probability that the given sequence of characters is an English word.
   * <p>
   * This works on the premise that most English words exhibit a similar set of character sequence patterns 
   * in both their prefix, body and suffix.
   * </p>
   * The value of the word is determined by analysis if the characters in the word
   * against the values in both the forward and backward lexical trees.
   * <BR><BR>
   * The maximium possible value a word can have is 1 (100%), thus for each character in the word
   * which is correctly positioned in accordance with the rules in the tree, the computed value is
   * increased by 1/W where 'W' is the length of the word; such that if a word perfectly matches a
   * branch of the tree a result of 1/W x W (or 1) will be returned.
   * <BR><BR>
   * Where a word fails to match a forward branch perfectly, two things are done:
   * <OL>
   *   <LI>For each remaining character in the token, the current total is reduced by the same percentile fraction as used to calculate the total.</LI>
   *   <LI>The token is given a "second chance" by repeating the initial calculation process with the reverse tree.</LI>
   * </OL>
   * @param word The word to be tested
   * @return A value between 0.0 and 1.0 indicating the probability that the String is an English word.
   */
  public double computeWordValue(String word) {


    // First, check the dictionary
    word = word.toLowerCase();
    double result;


    if(Arrays.binarySearch(tokens, word) > -1) {
      result = 1.0d;
    }
    else
    {
      long time = System.currentTimeMillis();
      double percentile = (1.0d / (double)word.length());
      result = computeWordValue(forwardTree, word, true, 0, percentile, 0.0d);


      if(result < 0.0d) {
        result = 0.0d;
      }
    }
    return result;
  }




  /**
   * The value of the word is determined by analysis if the characters in the word
   * against the values in both the forward and backward lexical trees
   * <BR><BR>
   * The maximium possible value a word can have is 1 (100%), thus for each character in the word
   * which is correctly positioned in accordance with the rules in the tree, the computed value is
   * increased by 1/W where 'W' is the length of the word; such that if a word perfectly matches a
   * branch of the tree a result of 1/W x W (or 1) will be returned
   * <BR><BR>
   * Where a word fails to match a forward branch perfectly, two things are done:
   * <OL>
   *   <LI>For each remaining character in the token, the current total is reduced by the same percentile fraction as used to calculate the total</LI>
   *   <LI>The token is given a "second chance" by repeating the initial calculation process with the reverse tree</LI>
   * </OL>
   * @param node
   * @param word
   * @param index
   * @return
   */
  private double computeWordValue(Char2ObjectOpenHashMap node, String word, boolean forward, int index, double percentile, double total) {


    char chr = word.charAt(index);


    Char2ObjectOpenHashMap next = (Char2ObjectOpenHashMap)node.get(chr);


    if(next != null) {


      total += percentile;


      if(forward) {
        index++;


        if(index < word.length()) {
          total = computeWordValue(next, word, forward, index, percentile, total);
        }
      }
      else
      {
        index--;


        if(index >= 0) {
          total = computeWordValue(next, word, forward, index, percentile, total);
        }
      }
    }
    else if(index < word.length() - 1 && forward) {


      for (int i = index; i < word.length(); i++) {
        total -= percentile;
      }


      total = computeWordValue(reverseTree, word, false, (word.length() - 1), percentile, total);
    }


    return total;
  }


  /**
   * Loops through the list of tokens and builds a character tree
   *
   */
  private void buildTrees() {
    // First, create a root node
    forwardTree = new Char2ObjectOpenHashMap();
    reverseTree = new Char2ObjectOpenHashMap();


    String token = null;
    char chr;


    Char2ObjectOpenHashMap currentNode = null;
    Char2ObjectOpenHashMap targetNode = null;




    for (int i = 0; i < tokens.length; i++) {
      token = tokens[i];


      if(token != null) {


        // Build the forward tree
        currentNode = forwardTree;


        for (int j = 0; j < token.length(); j++) {
          chr = token.charAt(j);


          // Attempt to locate the node from the current node
          targetNode = (Char2ObjectOpenHashMap)currentNode.get(chr);


          if(targetNode == null) {
            targetNode = new Char2ObjectOpenHashMap(1);
            //targetNode.setCharacter(chr);
            currentNode.put(chr, targetNode);
          }
          currentNode = targetNode;
        }


        // Build the reverse tree
        currentNode = reverseTree;


        for (int j = token.length() - 1; j >= 0; j--) {
          chr = token.charAt(j);


          // Attempt to locate the node from the current node
          targetNode =(Char2ObjectOpenHashMap)currentNode.get(chr);


          if(targetNode == null) {
            targetNode = new Char2ObjectOpenHashMap(1);
            //targetNode.setCharacter(chr);
            currentNode.put(chr, targetNode);
          }


          currentNode = targetNode;
        }
      }
    }
  }
}
// Source Code of org.jasen.core.linguistics.LexicalTreeAnalyzer
//
// Related Classes of org.jasen.core.linguistics.LexicalTreeAnalyzer