Package nlp.com.knowledgebooks.nlp

Source Code of nlp.com.knowledgebooks.nlp.ExtractNames

package nlp.com.knowledgebooks.nlp;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import nlp.com.knowledgebooks.nlp.util.ScoredList;
import nlp.com.knowledgebooks.nlp.util.Tokenizer;

/**
* Wrapper for code to find both human and place names in input text.
*
* <p/>
* Copyright 2002-2008 by Mark Watson. All rights reserved.
* <p/>
* <p/>
* Copyright 1998-2012 by Mark Watson. All rights reserved.
* <p/>
* This software can be used under either of the following licenses:
* <p/>
* 1. LGPL v3<br/>
* 2. Apache 2
* <p/>
*/
public class ExtractNames {
    /**
     *        Facade method: get all place and human names from a text string
     * @param words
     * @return
     */
    public ScoredList[] getProperNames(List<String> words) {
        ScoredList placeNames = new ScoredList();
        ScoredList humanNames = new ScoredList();
        ScoredList[] ret = new ScoredList[2];
        ret[0] = humanNames; ret[1] = placeNames;
        if (words == null) return ret;
        for (int i=0; i<words.size(); i++) {
            // 5 word human names:
            if (isHumanName(words, i, 5)) {
                String s = words.get(i) + " " + words.get(i+1) + " " + words.get(i+2) + " " + words.get(i+3) + " " + words.get(i+4);
                humanNames.addValue(s);
                i += 4;
                continue;
            }
            // 4 word human names:
            if (isHumanName(words, i, 4)) {
                String s = words.get(i) + " " + words.get(i+1) + " " + words.get(i+2) + " " + words.get(i+3);
                humanNames.addValue(s);
                i += 3;
                continue;
            }
            // 3 word names:
            if (isPlaceName(words, i, 3)) {
                String s = words.get(i) + " " + words.get(i+1) + " " + words.get(i+2);
                placeNames.addValue(s);
                i += 2;
                continue;
            }
            if (isHumanName(words, i, 3)) {
                String s = words.get(i) + " " + words.get(i+1) + " " + words.get(i+2);
                humanNames.addValue(s);
                i += 2;
                continue;
            }
            // 2 word names:
            if (isPlaceName(words, i, 2)) {
                String s = words.get(i) + " " + words.get(i+1);
                placeNames.addValue(s);
                i += 1;
                continue;
            }
            if (isHumanName(words, i, 2)) {
                String s = words.get(i) + " " + words.get(i+1);
                humanNames.addValue(s);
                i += 1;
                continue;
            }
            // 1 word names:
            if (isPlaceName(words, i, 1)) {
                placeNames.addValue(words.get(i));
                continue;
            }
        }
        return ret;
    }
    /**
     *
     * @param s
     * @return
     */
    public ScoredList[] getProperNames(String s) {
        List<String> words = Tokenizer.wordsToList(s);
        return getProperNames(words);
    }

    /**
     *
     * @param words
     * @param startIndex
     * @param numWords
     * @return
     */
    public boolean isPlaceName(List<String> words, int startIndex, int numWords) {
        if ((startIndex + numWords) > words.size())  return false;
        if (numWords == 1) return isPlaceName(words.get(startIndex));
        String s = "";
        for (int i=startIndex; i<(startIndex + numWords); i++) {
            if (i < (startIndex + numWords - 1)) s = s + words.get(startIndex) + " ";
            else                                 s = s + words.get(startIndex);
        }
        return isPlaceName(s);
    }

    /**
     *
     * @param name
     * @return
     */
    public boolean isPlaceName(String name) {
        if (placeNameHash.get(name)!=null) System.out.println("* place name: "+name+", placeNameHash.get(name): "+placeNameHash.get(name));
        return placeNameHash.get(name) != null;
    }

    /**
     *
     * @param s
     * @return
     */
    public boolean isHumanName(String s) {
        List<String> ss = Tokenizer.wordsToList(s);
        //System.out.print("Tokens: "); for (int i=0; i<ss.length; i++) System.out.print(ss[i] + " "); System.out.println();
        if (ss == nullreturn false;
        return isHumanName(ss);
    }


    /**
     *
     * @param words
     * @param index
     * @param numWords
     * @return
     */
    public boolean isHumanName(List<String> words, int index, int numWords) {
        if ((index + numWords) > words.size())  return false;
        if (numWords == 1) {
            return isHumanName(Arrays.asList(words.get(index)));
        }
        if (numWords == 2) {
            return isHumanName(Arrays.asList(words.get(index), words.get(index+1)));
        }
        if (numWords == 3) {
            return isHumanName(Arrays.asList(words.get(index), words.get(index+1), words.get(index+2)));
        }
        if (numWords == 4) {
            return isHumanName(Arrays.asList(words.get(index), words.get(index+1), words.get(index+2), words.get(index+3)));
        }
        if (numWords == 5) {
            return isHumanName(Arrays.asList(words.get(index), words.get(index+1), words.get(index+2), words.get(index+3), words.get(index+4)));
        }
        return false;
    }

    /**
     *
     * @param words
     * @return
     */
    public boolean isHumanName(List<String> words) {
        int len = words.size();
        if (len == 1) {
            if (lastNameHash.get(words.get(0)) != null) return true;
        } else if (len == 2) {
            if (firstNameHash.get(words.get(0)) != null && lastNameHash.get(words.get(1)) != null) return true;
            if (prefixHash.get(words.get(0))    != null && lastNameHash.get(words.get(1)) != null) return true;
        } else if (len == 3) {
            if (firstNameHash.get(words.get(0)) != null &&
                firstNameHash.get(words.get(1)) != null &&
                lastNameHash.get(words.get(2)) != null) return true;
            if (prefixHash.get(words.get(0)) != null &&
                firstNameHash.get(words.get(1)) != null &&
                lastNameHash.get(words.get(2)) != null) return true;
            if (prefixHash.get(words.get(0)) != null &&
                words.get(1).equals(".") &&
                lastNameHash.get(words.get(2)) != null) return true;
        } else if (len == 4) {
            if (firstNameHash.get(words.get(0)) != null &&
                firstNameHash.get(words.get(1)) != null &&
                firstNameHash.get(words.get(2)) != null &&
                lastNameHash.get(words.get(3)) != null) return true;
            if (firstNameHash.get(words.get(0)) != null &&
                words.get(1).length() == 1 &&
                words.get(2).equals(".") &&
                lastNameHash.get(words.get(3)) != null) return true;
            if (prefixHash.get(words.get(0)) != null &&
                firstNameHash.get(words.get(1)) != null &&
                firstNameHash.get(words.get(2)) != null &&
                lastNameHash.get(words.get(3)) != null) return true;
            if (prefixHash.get(words.get(0)) != null &&
                firstNameHash.get(words.get(1)) != null &&
                words.get(2).length()==1 &&
                lastNameHash.get(words.get(3)) != null) return true;
        } else if (len == 5) {
            if (firstNameHash.get(words.get(0)) != null &&
                firstNameHash.get(words.get(1)) != null &&
                words.get(2).length()==1 &&
                words.get(3).equals(".") &&
                lastNameHash.get(words.get(4)) != null) return true;
            if (prefixHash.get(words.get(0)) != null &&
                firstNameHash.get(words.get(1)) != null &&
                words.get(2).length()==1 &&
                words.get(3).equals(".") &&
                lastNameHash.get(words.get(4)) != null) return true;
        }
        return false;
    }

    /**
     *
     */
    public ExtractNames() {
        this("test_data/propername.ser");
    }

    /**
     *
     * @param dataPath
     */
    public ExtractNames(String dataPath) {
        if (lastNameHash != null) return; // static data already loaded
        try {
            InputStream ins =
                this.getClass().getClassLoader().getResourceAsStream(dataPath);
            if (ins == null) {
                ins = this.getClass().getClassLoader().getResourceAsStream(dataPath);
            }
            if (ins == null) {
                ins = new FileInputStream(dataPath);
            }
            if (ins == null) {
                System.out.println("\ncom.knowledgebooks.entity_extraction.Names: failed to open '" + dataPath + "'\n");
                System.exit(1);
            } else {
                ObjectInputStream p = new ObjectInputStream(ins);
                lastNameHash = (Hashtable) p.readObject();
                firstNameHash = (Hashtable) p.readObject();
                placeNameHash = (Hashtable) p.readObject();
                prefixHash = (Hashtable) p.readObject();
                ins.close();
               
                FileOutputStream fos = new FileOutputStream("lastnames.txt");
                OutputStreamWriter out = new OutputStreamWriter(fos);
                Enumeration enum2 = lastNameHash.keys();
                while (enum2.hasMoreElements()) {
                  Object key = enum2.nextElement();
                  out.write(key+"\n");
                }
                out.close();
        // temp: write out hash tables:
           if (true) {
                fos = new FileOutputStream("firstnames.txt");
                out = new OutputStreamWriter(fos);
                enum2 = firstNameHash.keys();
                while (enum2.hasMoreElements()) {
                  Object key = enum2.nextElement();
                  out.write(key+"\n");
                }
                out.close();
               
                fos = new FileOutputStream("placenames.txt");
                out = new OutputStreamWriter(fos);
                enum2 = placeNameHash.keys();
                while (enum2.hasMoreElements()) {
                  Object key2 = enum2.nextElement();
                  String key = "" + key2;
                  if (key.indexOf(';') != -1) key = key.substring(0,key.indexOf(';'));
                  if (key.indexOf('(') != -1) key = key.substring(0,key.indexOf('('));
                  if (key.indexOf(',') != -1) key = key.substring(0,key.indexOf(','));
                  key = key.trim();
                  out.write(key+":" + placeNameHash.get(key) +"\n");
                }
                out.close();
               
                fos = new FileOutputStream("prefixnames.txt");
                out = new OutputStreamWriter(fos);
                enum2 = prefixHash.keys();
                while (enum2.hasMoreElements()) {
                  Object key = enum2.nextElement();
                  out.write(key+"\n");
                }
                out.close();
           }     
            }
        } catch (Exception ee) {
            ee.printStackTrace();
        }
        System.out.println("# last names="+lastNameHash.size()+", # first names="+firstNameHash.size());
    }

    /**
     *
     * @param args
     */
    static public void main(String[] args) {
        ExtractNames extractNames = new ExtractNames();
        // initialize everything, before printing any output - trying to see what is taking so long!
        if (args.length>0) {
            ScoredList[] ret = extractNames.getProperNames(args[0]);
            System.out.println("Human names: " + ret[0].getValuesAsString());
            System.out.println("Place names: " + ret[1].getValuesAsString());
        } else {
            extractNames.isPlaceName("Paris");
            extractNames.isHumanName("President Bush");
            extractNames.isHumanName("President George Bush");
            extractNames.isHumanName("President George W. Bush");
            System.out.println("Initialization complete....");
            System.out.println("Paris: " + extractNames.isPlaceName("Paris"));
            System.out.println("Mexico: " + extractNames.isPlaceName("Mexico"));
            System.out.println("Fresno: " + extractNames.isPlaceName("Fresno"));
            System.out.println("Moscow: " + extractNames.isPlaceName("Moscow"));
            System.out.println("France: " + extractNames.isPlaceName("France"));
            System.out.println("Los Angeles: " + extractNames.isPlaceName("Los Angeles"));
            System.out.println("President Bush: " + extractNames.isHumanName("President Bush"));
            System.out.println("President George Bush: " + extractNames.isHumanName("President George Bush"));
            System.out.println("President George W. Bush: " + extractNames.isHumanName("President George W. Bush"));
            System.out.println("George W. Bush: " + extractNames.isHumanName("George W. Bush"));
            System.out.println("Senator Barbara Boxer: " + extractNames.isHumanName("Senator Barbara Boxer"));
            System.out.println("King Smith: " + extractNames.isHumanName("King Smith"));
            ScoredList[] ret = extractNames.getProperNames("George Bush played golf. President George W. Bush went to London England, Paris France and Mexico to see Mary Smith in Moscow. President Bush will return home Monday.");
            System.out.println("Human names: " + ret[0].getValuesAsString());
            System.out.println("Place names: " + ret[1].getValuesAsString());
            System.out.println("\n\n\n");
           
            // for book example:
            ExtractNames names = new ExtractNames();
            System.out.println("Los Angeles: " +
                names.isPlaceName("Los Angeles"));
    System.out.println("President Bush: " +
                names.isHumanName("President Bush"));          
    System.out.println("President George Bush: " +
           names.isHumanName("President George Bush"));
    System.out.println("President George W. Bush: " +
           names.isHumanName("President George W. Bush"));
    ScoredList[] ret1 = names.getProperNames(
          "George Bush played golf. President  George W. Bush went to London England, Paris France and Mexico to see Mary  Smith in Moscow. President Bush will return home Monday.");
    System.out.println("Human names: " +
                       ret1[0].getValuesAsString());
    System.out.println("Place names: " +
                       ret1[1].getValuesAsString());
        }
    }

    static Hashtable lastNameHash = null;
    static Hashtable firstNameHash = null;
    static Hashtable placeNameHash = null; // cache for database access
    static Hashtable prefixHash = null;

}
TOP

Related Classes of nlp.com.knowledgebooks.nlp.ExtractNames

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle, Inc. Contact coftware#gmail.com.