/*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.linguistics;
import it.unimi.dsi.fastutil.chars.Char2ObjectOpenHashMap;
import java.io.IOException;
import java.util.Arrays;
import org.jasen.core.token.SimpleWordTokenizer;
/**
* Employes a lexical tree approach to word recognition.
* <p>
* Based on a sample corpus, the analyser builds a tree of characters such that each characters in a word is a node in the tree.
* </p>
* <p>
* When a word with a similar character sequence is found, the path to the next character is strengthened
* </p>
* @author Jason Polites
*/
public class LexicalTreeAnalyzer {
private String[] tokens;
private Char2ObjectOpenHashMap forwardTree;
private Char2ObjectOpenHashMap reverseTree;
private static final String ENGLISH_DICTIONARY_PATH = "org/jasen/core/linguistics/dictionaries/english.dic";
/**
* Creates and initialized the analyzer
*/
public void initialize() throws IOException {
// Get the dictionary as a resource stream
SimpleWordTokenizer t = new SimpleWordTokenizer(this.getClass().getClassLoader().getResourceAsStream(ENGLISH_DICTIONARY_PATH));
t.tokenize();
tokens = t.getTokens();
Arrays.sort(tokens);
buildTrees();
}
/**
* Computes the probability that the given sequence of characters is an English word.
* <p>
* This works on the premise that most English words exhibit a similar set of character sequence patterns
* in both their prefix, body and suffix.
* </p>
* The value of the word is determined by analysis if the characters in the word
* against the values in both the forward and backward lexical trees.
* <BR><BR>
* The maximium possible value a word can have is 1 (100%), thus for each character in the word
* which is correctly positioned in accordance with the rules in the tree, the computed value is
* increased by 1/W where 'W' is the length of the word; such that if a word perfectly matches a
* branch of the tree a result of 1/W x W (or 1) will be returned.
* <BR><BR>
* Where a word fails to match a forward branch perfectly, two things are done:
* <OL>
* <LI>For each remaining character in the token, the current total is reduced by the same percentile fraction as used to calculate the total.</LI>
* <LI>The token is given a "second chance" by repeating the initial calculation process with the reverse tree.</LI>
* </OL>
* @param word The word to be tested
* @return A value between 0.0 and 1.0 indicating the probability that the String is an English word.
*/
public double computeWordValue(String word) {
// First, check the dictionary
word = word.toLowerCase();
double result;
if(Arrays.binarySearch(tokens, word) > -1) {
result = 1.0d;
}
else
{
long time = System.currentTimeMillis();
double percentile = (1.0d / (double)word.length());
result = computeWordValue(forwardTree, word, true, 0, percentile, 0.0d);
if(result < 0.0d) {
result = 0.0d;
}
}
return result;
}
/**
* The value of the word is determined by analysis if the characters in the word
* against the values in both the forward and backward lexical trees
* <BR><BR>
* The maximium possible value a word can have is 1 (100%), thus for each character in the word
* which is correctly positioned in accordance with the rules in the tree, the computed value is
* increased by 1/W where 'W' is the length of the word; such that if a word perfectly matches a
* branch of the tree a result of 1/W x W (or 1) will be returned
* <BR><BR>
* Where a word fails to match a forward branch perfectly, two things are done:
* <OL>
* <LI>For each remaining character in the token, the current total is reduced by the same percentile fraction as used to calculate the total</LI>
* <LI>The token is given a "second chance" by repeating the initial calculation process with the reverse tree</LI>
* </OL>
* @param node
* @param word
* @param index
* @return
*/
private double computeWordValue(Char2ObjectOpenHashMap node, String word, boolean forward, int index, double percentile, double total) {
char chr = word.charAt(index);
Char2ObjectOpenHashMap next = (Char2ObjectOpenHashMap)node.get(chr);
if(next != null) {
total += percentile;
if(forward) {
index++;
if(index < word.length()) {
total = computeWordValue(next, word, forward, index, percentile, total);
}
}
else
{
index--;
if(index >= 0) {
total = computeWordValue(next, word, forward, index, percentile, total);
}
}
}
else if(index < word.length() - 1 && forward) {
for (int i = index; i < word.length(); i++) {
total -= percentile;
}
total = computeWordValue(reverseTree, word, false, (word.length() - 1), percentile, total);
}
return total;
}
/**
* Loops through the list of tokens and builds a character tree
*
*/
private void buildTrees() {
// First, create a root node
forwardTree = new Char2ObjectOpenHashMap();
reverseTree = new Char2ObjectOpenHashMap();
String token = null;
char chr;
Char2ObjectOpenHashMap currentNode = null;
Char2ObjectOpenHashMap targetNode = null;
for (int i = 0; i < tokens.length; i++) {
token = tokens[i];
if(token != null) {
// Build the forward tree
currentNode = forwardTree;
for (int j = 0; j < token.length(); j++) {
chr = token.charAt(j);
// Attempt to locate the node from the current node
targetNode = (Char2ObjectOpenHashMap)currentNode.get(chr);
if(targetNode == null) {
targetNode = new Char2ObjectOpenHashMap(1);
//targetNode.setCharacter(chr);
currentNode.put(chr, targetNode);
}
currentNode = targetNode;
}
// Build the reverse tree
currentNode = reverseTree;
for (int j = token.length() - 1; j >= 0; j--) {
chr = token.charAt(j);
// Attempt to locate the node from the current node
targetNode =(Char2ObjectOpenHashMap)currentNode.get(chr);
if(targetNode == null) {
targetNode = new Char2ObjectOpenHashMap(1);
//targetNode.setCharacter(chr);
currentNode.put(chr, targetNode);
}
currentNode = targetNode;
}
}
}
}
}