Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

org.apache.lucene.analysis.tokenattributes.CharTermAttribute
The term text of a Token.

        List<String> tokens = new ArrayList<String>();
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream(FieldNames.FULLTEXT,
                    new StringReader(text));
            CharTermAttribute termAtt = stream
                    .addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream
                    .addAttribute(OffsetAttribute.class);
            // TypeAttribute type = stream.addAttribute(TypeAttribute.class);


            stream.reset();


            int poz = 0;
            boolean hasFulltextToken = false;
            StringBuilder token = new StringBuilder();
            while (stream.incrementToken()) {
                String term = termAtt.toString();
                int start = offsetAtt.startOffset();
                int end = offsetAtt.endOffset();
                if (start > poz) {
                    for (int i = poz; i < start; i++) {
                        for (char c : fulltextTokens) {

View Full Code Here

        List<String> tokens = new ArrayList<String>();
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream(FieldNames.FULLTEXT,
                    new StringReader(text));
            CharTermAttribute termAtt = stream
                    .addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream
                    .addAttribute(OffsetAttribute.class);
            // TypeAttribute type = stream.addAttribute(TypeAttribute.class);


            stream.reset();


            int poz = 0;
            boolean hasFulltextToken = false;
            StringBuilder token = new StringBuilder();
            while (stream.incrementToken()) {
                String term = termAtt.toString();
                int start = offsetAtt.startOffset();
                int end = offsetAtt.endOffset();
                if (start > poz) {
                    for (int i = poz; i < start; i++) {
                        for (char c : fulltextTokens) {

View Full Code Here

    }
    if(!input.incrementToken()) {
      return null;
    }


    CharTermAttribute termAtt = input.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = input.getAttribute(TypeAttribute.class);


    if(reusableToken == null) {
      reusableToken = new Token();
    }


    reusableToken.clear();
    if(termAtt != null) {
      //lucene 3.0
      //reusableToken.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
      //lucene 3.1
      reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    }
    if(offsetAtt != null) {
      //lucene 3.1
      //reusableToken.setStartOffset(offsetAtt.startOffset());
      //reusableToken.setEndOffset(offsetAtt.endOffset());

View Full Code Here

      stopWordsSet = new HashSet<String>();
      for (String field : fields) {
        TokenStream ts = null;
        try {
          ts = analyzer.tokenStream(field, stopWords);
          CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            stopWordsSet.add(termAtt.toString());
          }
          ts.end();
        } catch (IOException ioe) {
          throw new ParserException("IoException parsing stop words list in "
              + getClass().getName() + ":" + ioe.getLocalizedMessage());

View Full Code Here

      }


      // common case fast-path of first token not matching anything
      AttributeSource firstTok = nextTok();
      if (firstTok == null) return false;
      CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
      SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
      if (result == null) {
        copy(this, firstTok);
        return true;
      }


      // fast-path failed, clone ourselves if needed
      if (firstTok == this)
        firstTok = cloneAttributes();
      // OK, we matched a token, so find the longest match.


      matched = new LinkedList<AttributeSource>();


      result = match(result);


      if (result==null) {
        // no match, simply return the first token read.
        copy(this, firstTok);
        return true;
      }


      // reuse, or create new one each time?
      ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);


      //
      // there was a match... let's generate the new tokens, merging
      // in the matched tokens (position increments need adjusting)
      //
      AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
      boolean includeOrig = result.includeOrig();


      AttributeSource origTok = includeOrig ? firstTok : null;
      PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
      int origPos = firstPosIncAtt.getPositionIncrement();  // position of origTok in the original stream
      int repPos=0; // curr position in replacement token stream
      int pos=0;  // current position in merged token stream


      for (int i=0; i<result.synonyms.length; i++) {
        Token repTok = result.synonyms[i];
        AttributeSource newTok = firstTok.cloneAttributes();
        CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
        OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);


        OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);


        newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
        newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
        repPos += repTok.getPositionIncrement();
        if (i==0) repPos=origPos;  // make position of first token equal to original


        // if necessary, insert original tokens and adjust position increment
        while (origTok != null && origPos <= repPos) {

View Full Code Here

      if (tok != null) {
        // clone ourselves.
        if (tok == this)
          tok = cloneAttributes();
        // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
        CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
        SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());


        if (subMap != null) {
          // recurse
          result = match(subMap);
        }

View Full Code Here

  private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) throws IOException{
    StringReader reader = new StringReader( source );
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()){
        if( termAtt.length() > 0 )
          tokList.add( termAtt.toString() );
      }
    } finally{
      reader.close();
    }
    return tokList;

View Full Code Here

    TokenStream ts = analyzer.tokenStream("content", "this sentence");
    try {
      int j = -1;
    
      PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    
      ts.reset();
      while (ts.incrementToken()) {
        j += posIncrAtt.getPositionIncrement();
        String termText = termAtt.toString();
        q.add(new Term("content", termText), j);
      }
      ts.end();
    } finally {
      IOUtils.closeWhileHandlingException(ts);

View Full Code Here

  public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
    BooleanQuery q = new BooleanQuery();


    TokenStream ts = analyzer.tokenStream("content", "test sentence");
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    
      ts.reset();
      while (ts.incrementToken()) {
        String termText =  termAtt.toString();
        q.add(new TermQuery(new Term("content", termText)),
            BooleanClause.Occur.SHOULD);
      }
      ts.end();
    } finally {

View Full Code Here


      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }


      CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);


      if (numTokens == 0) {
        return new NoTokenFoundQueryNode();


      } else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext;
          hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.toString();


        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }


        fieldNode.setText(term);


        return fieldNode;


      } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
        if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
          // no phrase query:
          
          if (positionCount == 1) { 
            // simple case: only one position, with synonyms
            LinkedList<QueryNode> children = new LinkedList<QueryNode>();
            
            for (int i = 0; i < numTokens; i++) {
              String term = null;
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
                
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              
              children.add(new FieldQueryNode(field, term, -1, -1));
              
            }
            return new GroupQueryNode(
                new StandardBooleanQueryNode(children, positionCount==1));
            
          } else {
            // multiple positions
            QueryNode q = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(),false);
            QueryNode currentQuery = null;
            for (int i = 0; i < numTokens; i++) {
              String term = null;
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
                if (!(currentQuery instanceof BooleanQueryNode)) {
                  QueryNode t = currentQuery;
                  currentQuery = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), true);
                  ((BooleanQueryNode)currentQuery).add(t);
                }
                ((BooleanQueryNode)currentQuery).add(new FieldQueryNode(field, term, -1, -1));
              } else {
                if (currentQuery != null) {
                  if (this.defaultOperator == Operator.OR) {
                    q.add(currentQuery);
                  } else {
                    q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                  }
                }
                currentQuery = new FieldQueryNode(field, term, -1, -1);
              }
            }
            if (this.defaultOperator == Operator.OR) {
              q.add(currentQuery);
            } else {
              q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
            }
            
            if (q instanceof BooleanQueryNode) {
              q = new GroupQueryNode(q);
            }
            return q;
          }
        } else {
          // phrase query:
          MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();


          List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>();
          int position = -1;
          int i = 0;
          int termGroupCount = 0;
          for (; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.toString();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }


            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }


            if (positionIncrement > 0 && multiTerms.size() > 0) {


              for (FieldQueryNode termNode : multiTerms) {


                if (this.positionIncrementsEnabled) {
                  termNode.setPositionIncrement(position);
                } else {
                  termNode.setPositionIncrement(termGroupCount);
                }


                mpq.add(termNode);


              }


              // Only increment once for each "group" of
              // terms that were in the same position:
              termGroupCount++;


              multiTerms.clear();


            }


            position += positionIncrement;
            multiTerms.add(new FieldQueryNode(field, term, -1, -1));


          }


          for (FieldQueryNode termNode : multiTerms) {


            if (this.positionIncrementsEnabled) {
              termNode.setPositionIncrement(position);


            } else {
              termNode.setPositionIncrement(termGroupCount);
            }


            mpq.add(termNode);


          }


          return mpq;


        }


      } else {


        TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();


        int position = -1;


        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;


          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.toString();


            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

com.chenlb.mmseg4j.analysis.TokenUtils

com.code972.elasticsearch.rest.action.RestHebrewAnalyzerCheckWordAction

com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper

com.gentics.cr.lucene.analysis.CustomPatternAnalyzerTest

com.ikanow.infinit.e.api.knowledge.SearchHandler

com.livingsocial.hive.udf.Tokenize

com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer

com.mozilla.grouperfish.pig.eval.text.NGramTokenize

com.mozilla.grouperfish.pig.eval.text.Tokenize

ivory.core.tokenize.LuceneArabicAnalyzer

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.