Package: org.sf.mustru.utils

Usage examples of org.sf.mustru.utils.StandardBgramAnalyzer


  {
   //*-- smoke-test driver for StandardBgramAnalyzer: tokenizes a hard-coded sample string
   System.out.println("Started");
//  String[] stopwords = {};
// WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
//  StandardAnalyzer analyzer = new StandardAnalyzer();
    //*-- analyzer under test (the commented alternatives above were earlier experiments)
    StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer();
    //*-- sample text with numbers, currency, hyphens, and punctuation to exercise tokenization
    String str = "Alice was 1987 and 233,999.145$ beginning to get very tired of sitting-by-her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'";
    //*-- second sample immediately replaces the first; only this one is actually analyzed
    str = "We were on the east flank of Mt. Kilimanjaro -- at 19,342 feet the highest mountain in Africa. In the pre-dawn hours I locked my eyes onto the";;
//  String dirname = "/home/manuk/mustru_bak/test/data/tcat_testing/coffee";
  //String filename = "/home/manuk/html/junk/LA122489-0101.txt";
   // String str = Files.readFromFile( new File(filename) );
View Full Code Here


  synBuffer.append( (verbs.size() > 0) ? wnetTools.getSynonyms(verbs.get(0), "v"):""); synBuffer.append(" ");
  synBuffer.append( (adjectives.size() > 0) ? wnetTools.getSynonyms(adjectives.get(0), "a"):"");
  String[] synonyms = synBuffer.toString().trim().split(" ");
*/
  //*-- tokenize the question
  StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(question));
  ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
  entities = new ArrayList<String>();    //*-- list of entities in the question
  while ( (token = stream.next()) != null)
   { tokenList.add(token); if (token.type().equals("<ENTITY>")) entities.add(token.termText()); }
  //*-------------------------------------------------------------------
  //*-- build the query with the five components
  //*--
  //*-- 1. First identify the entity types for the query
  //*-------------------------------------------------------------------
  StringBuffer queryString = new StringBuffer();
  NumberFormat nf = NumberFormat.getInstance();
  nf.setMaximumIntegerDigits(3); nf.setMaximumFractionDigits(4);
  float wt = WT_QTYPE;      //*--- Weight for question type entities
  BooleanQuery theQuery = new BooleanQuery();
  LOOP: for (int i = 0; i < tokenList.size(); i++)
  {
   //*-- first try two word query tokens and then single word tokens
   String etype = null;
   if (i > 0) etype = qhash.get( tokenList.get(i - 1).termText() + " " + tokenList.get(i).termText() );
   if ( (etype == null) || (etype.length() < 2)) etype = qhash.get( tokenList.get(i).termText() );
    
   if ( (etype != null) && (etype.length() > 2) )
    { String[] etypes = etype.split("OR");
      for (int j = 0; j < etypes.length; j++)
      { queryString.append("contents:" + etypes[j].trim() + "^" + nf.format(wt) + " ");
        TermQuery tq = new TermQuery( new Term("contents", etypes[j])); tq.setBoost(wt);
        theQuery.add(tq, BooleanClause.Occur.SHOULD);
        entities.add(etypes[j]);
      }
     break LOOP;
    }
   }
  
  //*-------------------------------------------
  //*-- 2. Find entities in the question words
  //*-------------------------------------------
  wt = WT_ENTITY;
  for (int i = 0; i < tokenList.size(); i++)
  { if ( tokenList.get(i).type().equals("ENTITY") )  
    { String qword = tokenList.get(i).termText();
      queryString.append("contents:" + qword + "^" + nf.format(wt) + " ");
      TermQuery tq = new TermQuery( new Term("contents", qword)); tq.setBoost(wt);
      theQuery.add(tq, BooleanClause.Occur.SHOULD);
    }
  }
 
  //*-------------------------------------------------------------------------------
  //*-- 3. Create a list of weighted trigrams/bigrams/unigrams from the query
  //*-------------------------------------------------------------------------------
  int numNouns = nouns.size(); int numVerbs = verbs.size(); int numAdjectives = adjectives.size();
  String[] queryWords = question.split("\\s+"); int wordsLength = queryWords.length;
  boolean[] contentWord = new boolean[wordsLength];
  for (int i = 0; i < wordsLength; i++)
   { queryWords[i] = queryWords[i].toLowerCase(Constants.locale);
     contentWord[i] = false;
     for (int j = 0; j < nouns.size(); j++) if (queryWords[i].equalsIgnoreCase(nouns.get(j))) contentWord[i] = true;
     for (int j = 0; j < verbs.size(); j++) if (queryWords[i].equalsIgnoreCase(verbs.get(j))) contentWord[i] = true;
     for (int j = 0; j < adjectives.size(); j++) if (queryWords[i].equalsIgnoreCase(adjectives.get(j))) contentWord[i] = true;
   }
 
  String joinChar; 
  //*-- generate all possible bigrams with higher weights for bigrams that do not have stopwords
  float WT_NORM_BIGRAM = WT_BIGRAM;
  for (int i = 1; i < 4; i++) if (wordsLength > (Math.pow(2, (i + 1)))) WT_NORM_BIGRAM /= 2;
  LOOP2: for (int i = 1; i < wordsLength; i++)
  { 
   //*-- skip if the previous word was a question word
   //*-- if the previous word was a stop word use a underscore to build the bigram, otherwise use a space
   wt = 0;
   if ( !questionWords.contains(queryWords[i-1]) )
   {
     if (stopWords.contains(queryWords[i-1]) && stopWords.contains(queryWords[i])) continue LOOP2;
     joinChar = (stopWords.contains(queryWords[i-1]) || stopWords.contains(queryWords[i])) ? "_": " ";
     for (int j = i-1; j < i+1; j++) wt += (contentWord[j]) ? WT_NORM_BIGRAM: 0;
     String bigram = queryWords[i-1] + joinChar + queryWords[i];
     queryString.append("contents:\"" + bigram + "\"~0^" + wt + " ");
     PhraseQuery pq = new PhraseQuery(); pq.add( new Term("contents", bigram)); pq.setBoost(wt); pq.setSlop(0);
     theQuery.add(pq, BooleanClause.Occur.SHOULD);
     bigrams.add(bigram);
   }
  } //*-- end of for
 
  //*-- create unigrams from non-stop words and weigh unigrams near the start of the question
  //*-- higher than unigrams near the end of the question
  LOOP3: for (int i = 0; i < wordsLength; i++)
  { wt = WT_UNIGRAM;
 
    //*-- skip punctuation and very short words
    if ( (queryWords[i].length() < 2|| (!contentWord[i]) ) continue LOOP3;
   
    wt *=  ( (numNouns > 0) && (nouns.get(0).equalsIgnoreCase(queryWords[i])) ) ? 8:
           ( (numNouns > 1) && (nouns.get(1).equalsIgnoreCase(queryWords[i])) ) ? 4: 1;
    wt *=  ( (numVerbs > 0) && (verbs.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
           ( (numVerbs > 1) && (verbs.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
    wt *=  ( (numAdjectives > 0) && (adjectives.get(0).equalsIgnoreCase(queryWords[i])) ) ? 4:
           ( (numAdjectives > 1) && (adjectives.get(1).equalsIgnoreCase(queryWords[i])) ) ? 2: 1;
  
   queryString.append("contents:" + queryWords[i] + "^" + nf.format(wt) + " ");
   TermQuery tq = new TermQuery( new Term("contents", queryWords[i])); tq.setBoost(wt);
   theQuery.add(tq, BooleanClause.Occur.SHOULD);
  } //*-- end of for

  //*--------------------------------------------------------------------------
  //*-- 4. Add the query transformation for the part. query type and add the synonyms
  //*--------------------------------------------------------------------------
/*  wt = WT_SYNONYMS;
  for (int j = 0; j < synonyms.length; j++)
  { queryString.append("contents:" + synonyms[j] + "^" + nf.format(wt) + " ");
    TermQuery tq = new TermQuery( new Term("contents", synonyms[j])); tq.setBoost(wt);
    theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
  */
  wt = WT_TRANSFORM;
  Matcher matcher = whatPattern.matcher(question);
  if ( (matcher.matches()) && (nouns.size() > 0) )
  {  String qTransform = "\"" + nouns.get(0) + "_is" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
     qTransform = "\"" + nouns.get(0) + "_was" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
 
  matcher = wherePattern.matcher(question);
  if ( (matcher.matches()) && (nouns.size() > 0) )
  {  String qTransform = "is_located" + "\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     TermQuery tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
     qTransform = "\"located_at\"";
     queryString.append("contents:" + qTransform + "^" + nf.format(wt) + " ");
     tq = new TermQuery( new Term("contents", qTransform)); tq.setBoost(wt);
     theQuery.add(tq, BooleanClause.Occur.SHOULD);
  }
 
//  String query = queryString.toString();
//System.out.println("query string " + query);
//System.out.println("gen q: " + theQuery);

  analyzer.setExtractEntities(false);
  QueryParser qp = new QueryParser("contents", analyzer)
  try { return(qp.parse(queryString.toString()) ); }
  catch(ParseException pe) { }
 
  return(theQuery);
View Full Code Here

public SearchQuery()
{ PropertyConfigurator.configure (Constants.LOG4J_FILE);
   logger = Logger.getLogger(SearchQuery.class.getName());
  
   //*-- get the index searcher
   try { is = SearchTools.getSearcher(Constants.getINDEXDIR(), false); bgramAnalyzer = new StandardBgramAnalyzer()}
   catch (IOException ie) { logger.error("Problem with opening the Lucene index directory " + ie.getMessage()); }
    
   //*-- read the spell checker model
   try { readModel(Constants.SPELL_CHECK_MODEL); }
   catch (IOException ie) { logger.error("Could not read the spell checker file " + ie.getMessage()); }
View Full Code Here

  //*-- decide whether to (re)create the index: an explicit config value wins
  //*-- (0 = append, anything else = fresh); -1 means "unset", so fall back to freshCrawl
  boolean freshIndex = (crawlConfig.getFreshIndex() == -1) ? freshCrawl:
         (crawlConfig.getFreshIndex() ==  0) ? false: true;
  try 
  {
   //*-- open the index directory and an IndexWriter over it with the bigram analyzer;
   //*-- freshIndex=true wipes any existing index at this path
   fsd = FSDirectory.getDirectory(new File(Constants.getINDEXDIR()), freshIndex);
   analyzer = new StandardBgramAnalyzer(); iw = new IndexWriter(fsd, analyzer, freshIndex);
   iw.setSimilarity(new SearchSimilarity());
   //*-- hand the writer to the crawl thread reference for shared use
   ctRef.setIw(iw);
  }
  catch (IOException ie) { ctRef.cleanUp("Could not get IndexWriter " + ie.getMessage() ); }
View Full Code Here

  //*-- tail of an initializer: set up per-run state and an in-memory Lucene index
  tempFile = null;        fileIW = caller.getIw();                 
  cdoc = new ClassifyDoc();      docsProcessed = 0;  
  dbt = Constants.getDbt();

  //*-- create the RAM based Lucene IndexWriter
  //*-- entity extraction is enabled here (unlike the query-time analyzer)
  analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
  ramDir = new RAMDirectory(); //Similarity.setDefault(new SearchSimilarity());
  createRamIW();
  //*-- initTime was seeded with a negative start timestamp; adding "now" yields elapsed ms
  //*-- NOTE(review): assumed from the += idiom — confirm where initTime is initialized
  initTime +=  new Date().getTime();
}
View Full Code Here

TOP

Related Classes of org.sf.mustru.utils.StandardBgramAnalyzer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.