// Copyright 2003-2005. Mark Watson (markw@markwatson.com). All rights reserved.
// This software is released under the LGPL (www.fsf.org)
// THIS SOFTWARE COMES WITH NO WARRANTY
package com.markwatson.nlp.propernames;
import com.markwatson.data.ScoredList;
import java.util.*;
import java.io.*;
/**
 * Dictionary-based recognizer for human names and place names.
 *
 * <p>Four lookup tables (last names, first names, place names and name
 * prefixes) are deserialized once from a serialized resource and kept in
 * static fields, so they are shared by every instance of this class. A token
 * or phrase counts as "known" when the matching table holds a non-null value
 * for it.
 *
 * <p>If the tables could not be loaded, the failure is reported on
 * {@code System.err} and every lookup answers {@code false}.
 */
public class Names {

  /** Longest human-name pattern, in tokens, that {@link #isHumanName(String[])} knows. */
  private static final int MAX_HUMAN_NAME_WORDS = 5;

  // Shared lookup tables, filled by the first successful constructor call.
  static Hashtable lastNameHash = null;
  static Hashtable firstNameHash = null;
  static Hashtable placeNameHash = null;
  static Hashtable prefixHash = null;

  /**
   * Facade method: get all place and human names from a tokenized text.
   *
   * <p>At each position the longest pattern is tried first (5 and 4 word human
   * names, then 3, 2 and 1 word names, place names before human names of the
   * same length). Tokens consumed by a match are skipped. Single tokens are
   * only ever reported as place names.
   *
   * @param words the tokens to scan; {@code null} yields two empty lists
   * @return a two element array: index 0 holds the human names, index 1 the
   *         place names
   */
  public ScoredList[] getProperNames(String [] words) {
    ScoredList placeNames = new ScoredList();
    ScoredList humanNames = new ScoredList();
    ScoredList[] ret = new ScoredList[2];
    ret[0] = humanNames;
    ret[1] = placeNames;
    if (words == null) {
      return ret;
    }
    for (int i = 0; i < words.length; i++) {
      // After a match of n tokens advance by n - 1; the loop header adds the last step.
      if (isHumanName(words, i, 5)) {
        humanNames.addValue(join(words, i, 5));
        i += 4;
      } else if (isHumanName(words, i, 4)) {
        humanNames.addValue(join(words, i, 4));
        i += 3;
      } else if (isPlaceName(words, i, 3)) {
        placeNames.addValue(join(words, i, 3));
        i += 2;
      } else if (isHumanName(words, i, 3)) {
        humanNames.addValue(join(words, i, 3));
        i += 2;
      } else if (isPlaceName(words, i, 2)) {
        placeNames.addValue(join(words, i, 2));
        i += 1;
      } else if (isHumanName(words, i, 2)) {
        humanNames.addValue(join(words, i, 2));
        i += 1;
      } else if (isPlaceName(words, i, 1)) {
        placeNames.addValue(words[i]);
      }
    }
    return ret;
  }

  /**
   * Tokenizes {@code s} and delegates to {@link #getProperNames(String[])}.
   *
   * @param s the text to scan
   * @return a two element array: index 0 holds the human names, index 1 the
   *         place names
   */
  public ScoredList[] getProperNames(String s) {
    String [] words = com.knowledgebooks.nlp.util.Tokenizer.wordsToArray(s);
    return getProperNames(words);
  }

  /**
   * Tests whether {@code numWords} consecutive tokens, joined by single
   * spaces, form a known place name.
   *
   * @param words      the token array
   * @param startIndex index of the first token of the candidate
   * @param numWords   number of tokens in the candidate
   * @return {@code true} if the joined phrase is in the place name table;
   *         {@code false} if it is not or if the requested range does not lie
   *         inside {@code words}
   */
  public boolean isPlaceName(String [] words, int startIndex, int numWords) {
    if (!isValidRange(words, startIndex, numWords)) {
      return false;
    }
    return isPlaceName(join(words, startIndex, numWords));
  }

  /**
   * Tests whether {@code name} is a known place name.
   *
   * @param name the exact phrase to look up
   * @return {@code true} if the place name table has an entry for {@code name}
   */
  public boolean isPlaceName(String name) {
    return contains(placeNameHash, name);
  }

  /**
   * Tokenizes {@code s} and tests whether the tokens as a whole form a human
   * name.
   *
   * @param s the text to test
   * @return {@code true} if the tokens match one of the patterns of
   *         {@link #isHumanName(String[])}
   */
  public boolean isHumanName(String s) {
    String [] ss = com.knowledgebooks.nlp.util.Tokenizer.wordsToArray(s);
    if (ss == null) {
      return false;
    }
    return isHumanName(ss);
  }

  /**
   * Tests whether {@code numWords} consecutive tokens form a human name.
   *
   * @param words    the token array
   * @param index    index of the first token of the candidate
   * @param numWords number of tokens in the candidate; only 1 to 5 can match
   * @return {@code true} if the tokens match one of the patterns of
   *         {@link #isHumanName(String[])}; {@code false} otherwise or if the
   *         requested range does not lie inside {@code words}
   */
  public boolean isHumanName(String[] words, int index, int numWords) {
    if (numWords > MAX_HUMAN_NAME_WORDS || !isValidRange(words, index, numWords)) {
      return false;
    }
    // A fresh array per call: no state is shared between calls or instances.
    String[] candidate = new String[numWords];
    System.arraycopy(words, index, candidate, 0, numWords);
    return isHumanName(candidate);
  }

  /**
   * Tests whether the complete token array forms a human name. With F = first
   * name, L = last name, P = prefix, I = any single character token, the
   * accepted patterns are:
   * <pre>
   * 1 token : L
   * 2 tokens: F L | P L
   * 3 tokens: F F L | P F L | P . L
   * 4 tokens: F F F L | F I . L | P F F L | P F I L
   * 5 tokens: F F I . L | P F I . L
   * </pre>
   *
   * @param words the candidate tokens
   * @return {@code true} if the tokens match one of the patterns above
   */
  public boolean isHumanName(String[] words) {
    if (words == null || words.length < 1 || words.length > MAX_HUMAN_NAME_WORDS) {
      return false;
    }
    int len = words.length;
    // Every pattern ends with a last name.
    if (!contains(lastNameHash, words[len - 1])) {
      return false;
    }
    if (len == 1) {
      return true;
    }
    boolean startsWithFirst = contains(firstNameHash, words[0]);
    boolean startsWithPrefix = contains(prefixHash, words[0]);
    if (len == 2) {
      return startsWithFirst || startsWithPrefix;
    }
    if (len == 3) {
      if ((startsWithFirst || startsWithPrefix) && contains(firstNameHash, words[1])) {
        return true;
      }
      return startsWithPrefix && isPeriod(words[1]);
    }
    if (len == 4) {
      if ((startsWithFirst || startsWithPrefix)
          && contains(firstNameHash, words[1])
          && contains(firstNameHash, words[2])) {
        return true;
      }
      if (startsWithFirst && isSingleChar(words[1]) && isPeriod(words[2])) {
        return true;
      }
      return startsWithPrefix
          && contains(firstNameHash, words[1])
          && isSingleChar(words[2]);
    }
    // len == 5
    return (startsWithFirst || startsWithPrefix)
        && contains(firstNameHash, words[1])
        && isSingleChar(words[2])
        && isPeriod(words[3]);
  }

  /** Creates a recognizer backed by the default data file. */
  public Names() {
    this("data/propername/propername.ser");
  }

  /**
   * Creates a recognizer, loading the shared lookup tables from
   * {@code dataPath} unless an earlier instance already loaded them.
   *
   * <p>The path is first resolved as a class path resource and then as a file.
   * A load failure is reported on {@code System.err}; the tables then stay
   * unset, so a later constructor call tries again.
   *
   * @param dataPath resource name or file path of the serialized tables
   */
  public Names(String dataPath) {
    if (lastNameHash != null) {
      return; // static data already loaded
    }
    InputStream ins = null;
    try {
      ins = openData(dataPath);
      ObjectInputStream p = new ObjectInputStream(new BufferedInputStream(ins));
      // Read everything into locals first so that a failure half way through
      // never leaves the static tables partially filled.
      Hashtable lastNames = (Hashtable) p.readObject();
      Hashtable firstNames = (Hashtable) p.readObject();
      Hashtable placeNames = (Hashtable) p.readObject();
      Hashtable prefixes = (Hashtable) p.readObject();
      firstNameHash = firstNames;
      placeNameHash = placeNames;
      prefixHash = prefixes;
      // Assigned last: the "already loaded" guard above tests this field.
      lastNameHash = lastNames;
    } catch (Exception ee) {
      System.err.println("\ncom.markwatson.nlp.propernames.Names: failed to load '" + dataPath + "'\n");
      ee.printStackTrace();
    } finally {
      if (ins != null) {
        try {
          ins.close();
        } catch (IOException ignored) {
          // Nothing useful can be done if closing a read-only stream fails.
        }
      }
    }
  }

  /**
   * Opens the serialized data, preferring a class path resource over a file.
   *
   * @throws IOException if {@code dataPath} is neither a resource nor a
   *         readable file
   */
  private InputStream openData(String dataPath) throws IOException {
    ClassLoader loader = this.getClass().getClassLoader();
    InputStream ins = (loader != null)
        ? loader.getResourceAsStream(dataPath)
        : ClassLoader.getSystemResourceAsStream(dataPath);
    if (ins == null) {
      // Throws FileNotFoundException when the file is missing; never returns null.
      ins = new FileInputStream(dataPath);
    }
    return ins;
  }

  /** Returns true if {@code table} is loaded and has an entry for the non-null {@code key}. */
  private static boolean contains(Hashtable table, String key) {
    return table != null && key != null && table.get(key) != null;
  }

  /** Returns true if {@code word} is exactly one character long. */
  private static boolean isSingleChar(String word) {
    return word != null && word.length() == 1;
  }

  /** Returns true if {@code word} is the period token. */
  private static boolean isPeriod(String word) {
    return ".".equals(word);
  }

  /** Returns true if {@code count} tokens starting at {@code start} lie inside {@code words}. */
  private static boolean isValidRange(String[] words, int start, int count) {
    if (words == null || start < 0 || count < 1) {
      return false;
    }
    // Written as a subtraction so that start + count cannot overflow.
    return start <= words.length && count <= words.length - start;
  }

  /** Joins {@code count} tokens starting at {@code start} with single spaces. */
  private static String join(String[] words, int start, int count) {
    StringBuffer phrase = new StringBuffer();
    for (int i = start; i < start + count; i++) {
      if (i > start) {
        phrase.append(' ');
      }
      phrase.append(words[i]);
    }
    return phrase.toString();
  }

  /**
   * Command line demo. With an argument, prints the names found in
   * {@code args[0]}; without arguments, runs a fixed set of sample lookups.
   */
  static public void main(String[] args) {
    Names names = new Names();
    if (args.length>0) {
      ScoredList[] ret = names.getProperNames(args[0]);
      System.out.println("Human names: " + ret[0].getValuesAsString());
      System.out.println("Place names: " + ret[1].getValuesAsString());
    } else {
      // initialize everything, before printing any output - trying to see what is taking so long!
      names.isPlaceName("Paris");
      names.isHumanName("President Bush");
      names.isHumanName("President George Bush");
      names.isHumanName("President George W. Bush");
      System.out.println("Initialization complete....");
      System.out.println("Paris: " + names.isPlaceName("Paris"));
      System.out.println("Mexico: " + names.isPlaceName("Mexico"));
      System.out.println("Fresno: " + names.isPlaceName("Fresno"));
      System.out.println("Moscow: " + names.isPlaceName("Moscow"));
      System.out.println("France: " + names.isPlaceName("France"));
      System.out.println("Los Angeles: " + names.isPlaceName("Los Angeles"));
      System.out.println("President Bush: " + names.isHumanName("President Bush"));
      System.out.println("President George Bush: " + names.isHumanName("President George Bush"));
      System.out.println("President George W. Bush: " + names.isHumanName("President George W. Bush"));
      System.out.println("George W. Bush: " + names.isHumanName("George W. Bush"));
      System.out.println("Senator Barbara Boxer: " + names.isHumanName("Senator Barbara Boxer"));
      System.out.println("King Smith: " + names.isHumanName("King Smith"));
      ScoredList[] ret = names.getProperNames("George Bush played golf. President George W. Bush went to London England, Paris France and Mexico to see Mary Smith in Moscow. President Bush will return home Monday.");
      System.out.println("Human names: " + ret[0].getValuesAsString());
      System.out.println("Place names: " + ret[1].getValuesAsString());
    }
  }
}