Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute


        List<String> tokens = new ArrayList<String>();
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream(FieldNames.FULLTEXT,
                    new StringReader(text));
            CharTermAttribute termAtt = stream
                    .addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream
                    .addAttribute(OffsetAttribute.class);
            // TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            stream.reset();

            int poz = 0;
            boolean hasFulltextToken = false;
            StringBuilder token = new StringBuilder();
            while (stream.incrementToken()) {
                String term = termAtt.toString();
                int start = offsetAtt.startOffset();
                int end = offsetAtt.endOffset();
                if (start > poz) {
                    for (int i = poz; i < start; i++) {
                        for (char c : fulltextTokens) {
View Full Code Here


        List<String> tokens = new ArrayList<String>();
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream(FieldNames.FULLTEXT,
                    new StringReader(text));
            CharTermAttribute termAtt = stream
                    .addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream
                    .addAttribute(OffsetAttribute.class);
            // TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            stream.reset();

            int poz = 0;
            boolean hasFulltextToken = false;
            StringBuilder token = new StringBuilder();
            while (stream.incrementToken()) {
                String term = termAtt.toString();
                int start = offsetAtt.startOffset();
                int end = offsetAtt.endOffset();
                if (start > poz) {
                    for (int i = poz; i < start; i++) {
                        for (char c : fulltextTokens) {
View Full Code Here

    }
    if(!input.incrementToken()) {
      return null;
    }

    CharTermAttribute termAtt = input.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = input.getAttribute(TypeAttribute.class);

    if(reusableToken == null) {
      reusableToken = new Token();
    }

    reusableToken.clear();
    if(termAtt != null) {
      //lucene 3.0
      //reusableToken.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
      //lucene 3.1
      reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    }
    if(offsetAtt != null) {
      //lucene 3.1
      //reusableToken.setStartOffset(offsetAtt.startOffset());
      //reusableToken.setEndOffset(offsetAtt.endOffset());
View Full Code Here

      stopWordsSet = new HashSet<String>();
      for (String field : fields) {
        TokenStream ts = null;
        try {
          ts = analyzer.tokenStream(field, stopWords);
          CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            stopWordsSet.add(termAtt.toString());
          }
          ts.end();
        } catch (IOException ioe) {
          throw new ParserException("IoException parsing stop words list in "
              + getClass().getName() + ":" + ioe.getLocalizedMessage());
View Full Code Here

      }

      // common case fast-path of first token not matching anything
      AttributeSource firstTok = nextTok();
      if (firstTok == null) return false;
      CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
      SlowSynonymMap result = map.submap!=null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
      if (result == null) {
        copy(this, firstTok);
        return true;
      }

      // fast-path failed, clone ourselves if needed
      if (firstTok == this)
        firstTok = cloneAttributes();
      // OK, we matched a token, so find the longest match.

      matched = new LinkedList<AttributeSource>();

      result = match(result);

      if (result==null) {
        // no match, simply return the first token read.
        copy(this, firstTok);
        return true;
      }

      // reuse, or create new one each time?
      ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(result.synonyms.length + matched.size() + 1);

      //
      // there was a match... let's generate the new tokens, merging
      // in the matched tokens (position increments need adjusting)
      //
      AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
      boolean includeOrig = result.includeOrig();

      AttributeSource origTok = includeOrig ? firstTok : null;
      PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
      int origPos = firstPosIncAtt.getPositionIncrement()// position of origTok in the original stream
      int repPos=0; // curr position in replacement token stream
      int pos=0// current position in merged token stream

      for (int i=0; i<result.synonyms.length; i++) {
        Token repTok = result.synonyms[i];
        AttributeSource newTok = firstTok.cloneAttributes();
        CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
        OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);

        OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);

        newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
        newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
        repPos += repTok.getPositionIncrement();
        if (i==0) repPos=origPos;  // make position of first token equal to original

        // if necessary, insert original tokens and adjust position increment
        while (origTok != null && origPos <= repPos) {
View Full Code Here

      if (tok != null) {
        // clone ourselves.
        if (tok == this)
          tok = cloneAttributes();
        // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
        CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
        SlowSynonymMap subMap = map.submap.get(termAtt.buffer(), 0, termAtt.length());

        if (subMap != null) {
          // recurse
          result = match(subMap);
        }
View Full Code Here

  private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) throws IOException{
    StringReader reader = new StringReader( source );
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()){
        if( termAtt.length() > 0 )
          tokList.add( termAtt.toString() );
      }
    } finally{
      reader.close();
    }
    return tokList;
View Full Code Here

    TokenStream ts = analyzer.tokenStream("content", "this sentence");
    try {
      int j = -1;
   
      PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
   
      ts.reset();
      while (ts.incrementToken()) {
        j += posIncrAtt.getPositionIncrement();
        String termText = termAtt.toString();
        q.add(new Term("content", termText), j);
      }
      ts.end();
    } finally {
      IOUtils.closeWhileHandlingException(ts);
View Full Code Here

  public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
    BooleanQuery q = new BooleanQuery();

    TokenStream ts = analyzer.tokenStream("content", "test sentence");
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
   
      ts.reset();
      while (ts.incrementToken()) {
        String termText =  termAtt.toString();
        q.add(new TermQuery(new Term("content", termText)),
            BooleanClause.Occur.SHOULD);
      }
      ts.end();
    } finally {
View Full Code Here

      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }

      CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

      if (numTokens == 0) {
        return new NoTokenFoundQueryNode();

      } else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext;
          hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.toString();

        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }

        fieldNode.setText(term);

        return fieldNode;

      } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
        if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
          // no phrase query:
         
          if (positionCount == 1) {
            // simple case: only one position, with synonyms
            LinkedList<QueryNode> children = new LinkedList<QueryNode>();
           
            for (int i = 0; i < numTokens; i++) {
              String term = null;
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
               
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
             
              children.add(new FieldQueryNode(field, term, -1, -1));
             
            }
            return new GroupQueryNode(
                new StandardBooleanQueryNode(children, positionCount==1));
           
          } else {
            // multiple positions
            QueryNode q = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(),false);
            QueryNode currentQuery = null;
            for (int i = 0; i < numTokens; i++) {
              String term = null;
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                term = termAtt.toString();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
                if (!(currentQuery instanceof BooleanQueryNode)) {
                  QueryNode t = currentQuery;
                  currentQuery = new StandardBooleanQueryNode(Collections.<QueryNode>emptyList(), true);
                  ((BooleanQueryNode)currentQuery).add(t);
                }
                ((BooleanQueryNode)currentQuery).add(new FieldQueryNode(field, term, -1, -1));
              } else {
                if (currentQuery != null) {
                  if (this.defaultOperator == Operator.OR) {
                    q.add(currentQuery);
                  } else {
                    q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
                  }
                }
                currentQuery = new FieldQueryNode(field, term, -1, -1);
              }
            }
            if (this.defaultOperator == Operator.OR) {
              q.add(currentQuery);
            } else {
              q.add(new ModifierQueryNode(currentQuery, Modifier.MOD_REQ));
            }
           
            if (q instanceof BooleanQueryNode) {
              q = new GroupQueryNode(q);
            }
            return q;
          }
        } else {
          // phrase query:
          MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();

          List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>();
          int position = -1;
          int i = 0;
          int termGroupCount = 0;
          for (; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.toString();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }

            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            if (positionIncrement > 0 && multiTerms.size() > 0) {

              for (FieldQueryNode termNode : multiTerms) {

                if (this.positionIncrementsEnabled) {
                  termNode.setPositionIncrement(position);
                } else {
                  termNode.setPositionIncrement(termGroupCount);
                }

                mpq.add(termNode);

              }

              // Only increment once for each "group" of
              // terms that were in the same position:
              termGroupCount++;

              multiTerms.clear();

            }

            position += positionIncrement;
            multiTerms.add(new FieldQueryNode(field, term, -1, -1));

          }

          for (FieldQueryNode termNode : multiTerms) {

            if (this.positionIncrementsEnabled) {
              termNode.setPositionIncrement(position);

            } else {
              termNode.setPositionIncrement(termGroupCount);
            }

            mpq.add(termNode);

          }

          return mpq;

        }

      } else {

        TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();

        int position = -1;

        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;

          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.toString();

            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.