synBuffer.append( (verbs.size() > 0) ? wnetTools.getSynonyms(verbs.get(0), "v"):""); synBuffer.append(" ");
synBuffer.append( (adjectives.size() > 0) ? wnetTools.getSynonyms(adjectives.get(0), "a"):"");
String[] synonyms = synBuffer.toString().trim().split(" ");
//*-- tokenize the question
StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
TokenStream stream = analyzer.tokenStream("contents", new StringReader(question));
ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
entities = new ArrayList<String>(); //*-- list of entities in the question
while ( (token = != null)
{ tokenList.add(token); if (token.type().equals("<ENTITY>")) entities.add(token.termText()); }
//*-- build the query with the five components
//*-- 1. First identify the entity types for the query
StringBuffer queryString = new StringBuffer();
NumberFormat nf = NumberFormat.getInstance();
nf.setMaximumIntegerDigits(3); nf.setMaximumFractionDigits(4);
float wt = WT_QTYPE; //*--- Weight for question type entities
BooleanQuery theQuery = new BooleanQuery();
LOOP: for (int i = 0; i < tokenList.size(); i++)
//*-- first try two word query tokens and then single word tokens
String etype = null;
if (i > 0) etype = qhash.get( tokenList.get(i - 1).termText() + " " + tokenList.get(i).termText() );
if ( (etype == null) || (etype.length() < 2)) etype = qhash.get( tokenList.get(i).termText() );
if ( (etype != null) && (etype.length() > 2) )
{ String[] etypes = etype.split("OR");
for (int j = 0; j < etypes.length; j++)
{ queryString.append("contents:" + etypes[j].trim() + "^" + nf.format(wt) + " ");
TermQuery tq = new TermQuery( new Term("contents", etypes[j])); tq.setBoost(wt);
theQuery.add(tq, BooleanClause.Occur.SHOULD);
break LOOP;
//*-- 2. Find entities in the question words
for (int i = 0; i < tokenList.size(); i++)
{ if ( tokenList.get(i).type().equals("ENTITY") )
{ String qword = tokenList.get(i).termText();
queryString.append("contents:" + qword + "^" + nf.format(wt) + " ");
TermQuery tq = new TermQuery( new Term("contents", qword)); tq.setBoost(wt);
theQuery.add(tq, BooleanClause.Occur.SHOULD);
//*-- 3. Create a list of weighted trigrams/bigrams/unigrams from the query
int numNouns = nouns.size(); int numVerbs = verbs.size(); int numAdjectives = adjectives.size();
String[] queryWords = question.split("\\s+"); int wordsLength = queryWords.length;
boolean[] contentWord = new boolean[wordsLength];
for (int i = 0; i < wordsLength; i++)
{ queryWords[i] = queryWords[i].toLowerCase(Constants.locale);
contentWord[i] = false;
for (int j = 0; j < nouns.size(); j++) if (queryWords[i].equalsIgnoreCase(nouns.get(j))) contentWord[i] = true;
for (int j = 0; j < verbs.size(); j++) if (queryWords[i].equalsIgnoreCase(verbs.get(j))) contentWord[i] = true;
for (int j = 0; j < adjectives.size(); j++) if (queryWords[i].equalsIgnoreCase(adjectives.get(j))) contentWord[i] = true;
String joinChar;
//*-- generate all possible bigrams with higher weights for bigrams that do not have stopwords
for (int i = 1; i < 4; i++) if (wordsLength > (Math.pow(2, (i + 1)))) WT_NORM_BIGRAM /= 2;
LOOP2: for (int i = 1; i < wordsLength; i++)
//*-- skip if the previous word was a question word
//*-- if the previous word was a stop word use an underscore to build the bigram, otherwise use a space
wt = 0;
if ( !questionWords.contains(queryWords[i-1]) )
if (stopWords.contains(queryWords[i-1]) && stopWords.contains(queryWords[i])) continue LOOP2;
joinChar = (stopWords.contains(queryWords[i-1]) || stopWords.contains(queryWords[i])) ? "_": " ";
for (int j = i-1; j < i+1; j++) wt += (contentWord[j]) ? WT_NORM_BIGRAM: 0;
String bigram = queryWords[i-1] + joinChar + queryWords[i];
queryString.append("contents:\"" + bigram + "\"~0^" + wt + " ");
PhraseQuery pq = new PhraseQuery(); pq.add( new Term("contents", bigram)); pq.setBoost(wt); pq.setSlop(0);
theQuery.add(pq, BooleanClause.Occur.SHOULD);
} //*-- end of for
//*-- create unigrams from non-stop words and weigh unigrams near the start of the question
//*-- higher than unigrams near the end of the question
LOOP3: for (int i = 0; i < wordsLength; i++)
{ wt = WT_UNIGRAM;
//*-- skip punctuation, very short words, and words that are not nouns, verbs, or adjectives
if ( (queryWords[i].length() < 2) || (!contentWord[i]) ) continue LOOP3;
wt *= ( (numNouns > 0) && (nouns.get(0).equalsIgnoreCase(queryWords[i])) ) ? 8:
( (numNouns > 1) && (nouns.get(1).equalsIgnoreCase(queryWords[i])) ) ? 4: 1;
wt *= ( (numVerbs > 0) && (verbs.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
( (numVerbs > 1) && (verbs.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
wt *= ( (numAdjectives > 0) && (adjectives.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
( (numAdjectives > 1) && (adjectives.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
queryString.append("contents:" + queryWords[i] + "^" + nf.format(wt) + " ");
TermQuery tq = new TermQuery( new Term("contents", queryWords[i])); tq.setBoost(wt);
theQuery.add(tq, BooleanClause.Occur.SHOULD);
} //*-- end of for
//*-- 4. Add the query transformation for the particular query type and add the synonyms
/* wt = WT_SYNONYMS;
for (int j = 0; j < synonyms.length; j++)
{ queryString.append("contents:" + synonyms[j] + "^" + nf.format(wt) + " ");
TermQuery tq = new TermQuery( new Term("contents", synonyms[j])); tq.setBoost(wt);
theQuery.add(tq, BooleanClause.Occur.SHOULD);
Matcher matcher = whatPattern.matcher(question);
if ( (matcher.matches()) && (nouns.size() > 0) )
{ String qTransform = "\"" + nouns.get(0) + "_is" + "\"";
queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
theQuery.add(tq, BooleanClause.Occur.SHOULD);
qTransform = "\"" + nouns.get(0) + "_was" + "\"";
queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
theQuery.add(tq, BooleanClause.Occur.SHOULD);
matcher = wherePattern.matcher(question);
if ( (matcher.matches()) && (nouns.size() > 0) )
{ String qTransform = "is_located" + "\"";
queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
theQuery.add(tq, BooleanClause.Occur.SHOULD);
qTransform = "\"located_at\"";
queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
theQuery.add(tq, BooleanClause.Occur.SHOULD);
// String query = queryString.toString();
//System.out.println("query string " + query);
//System.out.println("gen q: " + theQuery);
QueryParser qp = new QueryParser("contents", analyzer);
try { return(qp.parse(queryString.toString()) ); }
catch(ParseException pe) { }