Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.Token$TokenAttributeFactory


  // overridden
  public Token next() throws IOException {
    if (mIter.hasNext()) {
      String text = (String) mIter.next();
      return new Token(text, 0, text.length());
    } else {
      return null;
    }
  }
View Full Code Here


    Token[] tokens = tokensFromAnalysis(analyzer, text);

    int position = 0;
    for (int i = 0; i < tokens.length; i++)
     {
      Token token = tokens[i];
      int increment = token.getPositionIncrement();
      if (increment > 0)
      { position = position + increment;
        System.out.println();
        System.out.print(position + ": ");
      }
      System.out.print("\t [" + token.termText() + ": " + token.type() + "] " + token.startOffset() + ":" + token.endOffset());
     
     } //*-- end of for
    System.out.println("");
  }
View Full Code Here

   * Use the passed analyzer to get a list of tokens from the text
   */
  private static Token[] tokensFromAnalysis(Analyzer analyzer, String text) throws IOException
  {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
    while ( (token = stream.next()) != null) tokenList.add(token);
    Token[] tokens = new Token[tokenList.size()];
    for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
    return (tokens);
    }
View Full Code Here

  String[] synonyms = synBuffer.toString().trim().split(" ");
*/
  //*-- tokenize the question
  StandardBgramAnalyzer analyzer = new StandardBgramAnalyzer(); analyzer.setExtractEntities(true);
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(question));
  ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
  entities = new ArrayList<String>();    //*-- list of entities in the question
  while ( (token = stream.next()) != null)
   { tokenList.add(token); if (token.type().equals("<ENTITY>")) entities.add(token.termText()); }
  //*-------------------------------------------------------------------
  //*-- build the query with the five components
  //*--
  //*-- 1. First identify the entity types for the query
View Full Code Here

     int beginIndex = (prevSentence.length() > addChars) ? prevSentence.length() - addChars: 0;
     sentence = prevSentence.substring(beginIndex, prevSentence.length()) + sentence;
   }
   //*-- build a list of tokens from the sentence
   TokenStream stream = analyzer.tokenStream("contents", new StringReader(sentence));
   ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
   while ( (token = stream.next()) != null) tokenList.add(token);
  
   //*-- initialize the boolean arrays and scores
   boolean[] foundWords = new boolean[tokenList.size()];
   for (int i = 0; i < nouns.length; i++) foundNouns[i] = false;
View Full Code Here

  * Use the passed analyzer to get a list of tokens from the text
  */
private static Token[] tokensFromAnalysis(Analyzer analyzer, String text) throws IOException
{
   TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
   ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
   while ( (token = stream.next()) != null) tokenList.add(token);
   Token[] tokens = new Token[tokenList.size()];
   for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
   return (tokens);
   }
View Full Code Here

     * Use the passed analyzer to get a list of tokens from the text
     */
    private static Token[] tokensFromAnalysis(Analyzer analyzer, String text
    {
      TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
      ArrayList<Token> tokenList = new ArrayList<Token>(); Token token = null;
      try { while ( (token = stream.next()) != null) tokenList.add(token); }
      catch (IOException ie) { System.err.println("Tokenizer problem: " + ie.getMessage()); }
      Token[] tokens = new Token[tokenList.size()];
      for (int i = 0; i < tokens.length; i++) tokens[i] = tokenList.get(i);
      return (tokens);
View Full Code Here

  public final Token next() throws IOException
  {
  
   //*-- check for a saved token
   if (stoken != null)
    { Token tempToken = stoken; stoken = null;
      tempToken.setPositionIncrement(0); return(tempToken); }
  
   Token ctoken = input.next()if (ctoken == null) return (null);
   String ctext = ctoken.termText();
  
   //*-- check for an unigram entity
   String etype = ehash.get(ctext);
   if (etype != null) stoken = new Token(etype, ctoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
  
   //*-- check for a bigram entity
   if (ptoken != null)   
   { String bigram = ptoken.termText() + " " + ctext;
     etype = ehash.get(bigram);
     if (etype != null)
      stoken = new Token(etype, ptoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
   }
  
   if (number.matcher(ctext).matches())
     stoken = new Token("enumber", ctoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
  
   ptoken = ctoken;
   return (ctoken);
  }
View Full Code Here

     */
    public final Token next() throws IOException
    {
     //*-- if a token was saved, return it
     if (stoken != null) //*-- no. 1
     { Token tempToken = stoken; stoken = null;
       return(tempToken); }
     
     //*-- get the next token and check if it is a stop word
     ctoken = input.next()
     if (ctoken == null) return (null);    //*-- nos. 7 and 8
     String ctokenText = ctoken.termText();
     //*-- set stopword status: True if in stopwords list, or does NOT contain an alphabetic character
     cstop = ( stopWords.contains(ctokenText) ) ? true: (wordPattern.matcher(ctokenText).matches()) ? false: true;
    
     //*-- if there was a previous token
     if (ptoken == null)
     { //*-- continue if the current token is a stop word
       if (cstop)   //*-- no. 5
       { ptoken = ctoken; pstop = cstop;
       ctoken = input.next(); cstop = (ctoken != null) ? stopWords.contains(ctoken.termText()): false;
       } //*-- otherwise, return the current token
       else     //*-- no. 6
       { ptoken = ctoken; pstop = cstop; return(ctoken); }
     }
    
     //*-- if the current token is a stop word, return a
     //*-- bigram from the previous and current tokens
     if (cstop)
      { Token tempToken = createBigram(ptoken, ctoken);
        if (!stopWords.contains(ptoken.termText()) ) tempToken.setPositionIncrement(0); //*-- no. 3
        ptoken = ctoken; pstop = cstop; return(tempToken); }    //*-- nos. 3 and 4
    
     //*-- if the previous token was a stop word, first save
     //*-- the current token and return a bigram from previous
     //*-- and current tokens. In next call, return the saved token
     if (pstop)    //*-- no. 1
     { if (ctoken != null)
      { stoken = ctoken; stoken.setPositionIncrement(0);
        Token tempToken = createBigram(ptoken, ctoken);
        ptoken = ctoken; pstop = cstop; return(tempToken);
      }
     }
    
     ptoken = ctoken; pstop = cstop;
View Full Code Here

    }
  
    //*-- return a bigram token from the passed tokens
   protected Token createBigram(Token aToken, Token bToken)
   { String bigram = aToken.termText() + "_" + bToken.termText();
     return ( new Token(bigram, aToken.startOffset(), bToken.endOffset(), "<BIGRAM>" ) );
   }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.Token$TokenAttributeFactory

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.