Package: org.sf.mustru.utils

Usage examples of org.sf.mustru.utils.StandardBgramAnalyzer


  {
   //*-- smoke-test driver for StandardBgramAnalyzer: tokenizes a hard-coded sample string
   System.out.println("Started");
//  String[] stopwords = {};
// WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
//  StandardAnalyzer analyzer = new StandardAnalyzer();
    //*-- analyzer under test (the commented alternatives above were earlier experiments)
    StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer();
    //*-- sample text with numbers, currency, hyphens, and punctuation to exercise tokenization
    String str = "Alice was 1987 and 233,999.145$ beginning to get very tired of sitting-by-her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'";
    //*-- second sample immediately replaces the first; only this one is actually analyzed
    str = "We were on the east flank of Mt. Kilimanjaro -- at 19,342 feet the highest mountain in Africa. In the pre-dawn hours I locked my eyes onto the";;
//  String dirname = "/home/manuk/mustru_bak/test/data/tcat_testing/coffee";
  //String filename = "/home/manuk/html/junk/LA122489-0101.txt";
   // String str = Files.readFromFile( new File(filename) );
View Full Code Here


  synBuffer.append( (verbs.size() > 0) ? wnetTools.getSynonyms(verbs.get(0), "v"):""); synBuffer.append(" ");
  synBuffer.append( (adjectives.size() > 0) ? wnetTools.getSynonyms(adjectives.get(0), "a"):"");
  String[] synonyms = synBuffer.toString().trim().split(" ");
*/
  //*-- tokenize the question
  StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(question));
  ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
  entities = new ArrayList<String>();    //*-- list of entities in the question
  while ( (token = stream.next()) != null)
   { tokenList.add(token); if (token.type().equals("<ENTITY>")) entities.add(token.termText()); }
  //*-------------------------------------------------------------------
  //*-- build the query with the five components
  //*--
  //*-- 1. First identify the entity types for the query
  //*-------------------------------------------------------------------
  StringBuffer queryString = new StringBuffer();
  NumberFormat nf = NumberFormat.getInstance();
  nf.setMaximumIntegerDigits(3); nf.setMaximumFractionDigits(4);
  float wt = WT_QTYPE;      //*--- Weight for question type entities
  BooleanQuery theQuery = new BooleanQuery();
  LOOP: for (int i = 0; i < tokenList.size(); i++)
  {
   //*-- first try two word query tokens and then single word tokens
   String etype = null;
   if (i > 0) etype = qhash.get( tokenList.get(i - 1).termText() + " " + tokenList.get(i).termText() );
   if ( (etype == null) || (etype.length() < 2)) etype = qhash.get( tokenList.get(i).termText() );
    
   if ( (etype != null) && (etype.length() > 2) )
    { String[] etypes = etype.split("OR");
      for (int j = 0; j < etypes.length; j++)
      { queryString.append("contents:" + etypes[j].trim() + "^" + nf.format(wt) + " ");
        TermQuery tq = new TermQuery( new Term("contents", etypes[j])); tq.setBoost(wt);
        theQuery.add(tq, BooleanClause.Occur.SHOULD);
        entities.add(etypes[j]);
      }
     break LOOP;
    }
   }
  
  //*-------------------------------------------
  //*-- 2. Find entities in the question words
  //*-------------------------------------------
  wt = WT_ENTITY;
  for (int i = 0; i < tokenList.size(); i++)
  { if ( tokenList.get(i).type().equals("ENTITY") )  
    { String qword = tokenList.get(i).termText();
      queryString.append("contents:" + qword + "^" + nf.format(wt) + " ");
      TermQuery tq = new TermQuery( new Term("contents", qword)); tq.setBoost(wt);
      theQuery.add(tq, BooleanClause.Occur.SHOULD);
    }
  }
 
  //*-------------------------------------------------------------------------------
  //*-- 3. Create a list of weighted trigrams/bigrams/unigrams from the query
  //*-------------------------------------------------------------------------------
  int numNouns = nouns.size(); int numVerbs = verbs.size(); int numAdjectives = adjectives.size();
  String[] queryWords = question.split("\\s+"); int wordsLength = queryWords.length;
  boolean[] contentWord = new boolean[wordsLength];
  for (int i = 0; i < wordsLength; i++)
   { queryWords[i] = queryWords[i].toLowerCase(Constants.locale);
     contentWord[i] = false;
     for (int j = 0; j < nouns.size(); j++) if (queryWords[i].equalsIgnoreCase(nouns.get(j))) contentWord[i] = true;
     for (int j = 0; j < verbs.size(); j++) if (queryWords[i].equalsIgnoreCase(verbs.get(j))) contentWord[i] = true;
     for (int j = 0; j < adjectives.size(); j++) if (queryWords[i].equalsIgnoreCase(adjectives.get(j))) contentWord[i] = true;
   }
 
  String joinChar; 
  //*-- generate all possible bigrams with higher weights for bigrams that do not have stopwords
  float WT_NORM_BIGRAM = WT_BIGRAM;
  for (int i = 1; i < 4; i++) if (wordsLength > (Math.pow(2, (i + 1)))) WT_NORM_BIGRAM /= 2;
  LOOP2: for (int i = 1; i < wordsLength; i++)
  { 
   //*-- skip if the previous word was a question word
   //*-- if the previous word was a stop word use a underscore to build the bigram, otherwise use a space
   wt = 0;
   if ( !questionWords.contains(queryWords[i-1]) )
   {
     if (stopWords.contains(queryWords[i-1]) && stopWords.contains(queryWords[i])) continue LOOP2;
     joinChar = (stopWords.contains(queryWords[i-1]) || stopWords.contains(queryWords[i])) ? "_": " ";
     for (int j = i-1; j < i+1; j++) wt += (contentWord[j]) ? WT_NORM_BIGRAM: 0;
     String bigram = queryWords[i-1] + joinChar + queryWords[i];
     queryString.append("contents:\"" + bigram + "\"~0^" + wt + " ");
     PhraseQuery pq = new PhraseQuery(); pq.add( new Term("contents", bigram)); pq.setBoost(wt); pq.setSlop(0);
     theQuery.add(pq, BooleanClause.Occur.SHOULD);
     bigrams.add(bigram);
   }
  } //*-- end of for
 
  //*-- create unigrams from non-stop words and weigh unigrams near the start of the question
  //*-- higher than unigrams near the end of the question
  LOOP3: for (int i = 0; i < wordsLength; i++)
  { wt = WT_UNIGRAM;
 
    //*-- skip punctuation and very short words
    if ( (queryWords[i].length() < 2|| (!contentWord[i]) ) continue LOOP3;
   
    wt *=  ( (numNouns > 0) && (nouns.get(0).equalsIgnoreCase(queryWords[i])) ) ? 8:
           ( (numNouns > 1) && (nouns.get(1).equalsIgnoreCase(queryWords[i])) ) ? 4: 1;
    wt *=  ( (numVerbs > 0) && (verbs.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
           ( (numVerbs > 1) && (verbs.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
    wt *=  ( (numAdjectives > 0) && (adjectives.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
           ( (numAdjectives > 1) && (adjectives.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
  
   queryString.append("contents:" + queryWords[i] + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", queryWords[i])); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
  } //*-- end of for

  //*--------------------------------------------------------------------------
  //*-- 4. Add the query transformation for the part. query type and add the synonyms
  //*--------------------------------------------------------------------------
/*  wt = WT_SYNONYMS;
  for (int j = 0; j < synonyms.length; j++)
  { queryString.append("contents:" + synonyms[j] + "^" + nf.format(wt) + " ");
    TermQuery tq = new TermQuery( new Term("contents", synonyms[j])); tq.setBoost(wt);
    theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
  */
  wt = WT_TRANSFORM;
  Matcher matcher = whatPattern.matcher(question);
  if ( (matcher.matches()) && (nouns.size() > 0) )
  {  String qTransform = "\"" + nouns.get(0) + "_is" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
     qTransform = "\"" + nouns.get(0) + "_was" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
 
  matcher = wherePattern.matcher(question);
  if ( (matcher.matches()) && (nouns.size() > 0) )
  {  String qTransform = "is_located" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
     qTransform = "\"located_at\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
 
//  String query = queryString.toString();
//System.out.println("query string " + query);
//System.out.println("gen q: " + theQuery);

  analyzer.setExtractEntities(false);
  QueryParser qp = new QueryParser("contents", analyzer)
  try { return(qp.parse(queryString.toString()) ); }
  catch(ParseException pe) { }
 
  return(theQuery);
View Full Code Here

public SearchQuery()
{ PropertyConfigurator.configure (Constants.LOG4J_FILE);
   logger = Logger.getLogger(SearchQuery.class.getName());
  
   //*-- get the index searcher
   try { is = SearchTools.getSearcher(Constants.getINDEXDIR(), false); bgramAnalyzer = new StandardBgramAnalyzer()}
   catch (IOException ie) { logger.error("Problem with opening the Lucene index directory " + ie.getMessage()); }
    
   //*-- read the spell checker model
   try { readModel(Constants.SPELL_CHECK_MODEL); }
   catch (IOException ie) { logger.error("Could not read the spell checker file " + ie.getMessage()); }
View Full Code Here

  //*-- decide whether to (re)create the index: an explicit config value wins
  //*-- (0 = append, anything else = fresh); -1 means "unset", so fall back to freshCrawl
  boolean freshIndex = (crawlConfig.getFreshIndex() == -1) ? freshCrawl:
         (crawlConfig.getFreshIndex() ==  0) ? false: true;
  try 
  {
   //*-- open the index directory and an IndexWriter over it with the bigram analyzer;
   //*-- freshIndex=true wipes any existing index at this path
   fsd = FSDirectory.getDirectory(new File(Constants.getINDEXDIR()), freshIndex);
   analyzer = new StandardBgramAnalyzer(); iw = new IndexWriter(fsd, analyzer, freshIndex);
   iw.setSimilarity(new SearchSimilarity());
   //*-- hand the writer to the crawl thread reference for shared use
   ctRef.setIw(iw);
  }
  catch (IOException ie) { ctRef.cleanUp("Could not get IndexWriter " + ie.getMessage() ); }
View Full Code Here

  //*-- tail of an initializer: set up per-run state and an in-memory Lucene index
  tempFile = null;        fileIW = caller.getIw();                 
  cdoc = new ClassifyDoc();      docsProcessed = 0;  
  dbt = Constants.getDbt();

  //*-- create the RAM based Lucene IndexWriter
  //*-- entity extraction is enabled here (unlike the query-time analyzer)
  analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
  ramDir = new RAMDirectory(); //Similarity.setDefault(new SearchSimilarity());
  createRamIW();
  //*-- initTime was seeded with a negative start timestamp; adding "now" yields elapsed ms
  //*-- NOTE(review): assumed from the += idiom — confirm where initTime is initialized
  initTime +=  new Date().getTime();
}
View Full Code Here

TOP

Related Classes of org.sf.mustru.utils.StandardBgramAnalyzer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.