Package: org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute


      // Compare a TeeSinkTokenFilter's sink output against an equivalent
      // stand-alone filter chain built over the same input text.
      //make sure we produce the same tokens
      TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
      TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
      // Drain the tee fully so the sink has been fed every token before it
      // is consumed below.
      teeStream.consumeAllTokens();
      TokenStream stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), 100);
      CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
      CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
      // Walk both streams in lock-step; the attribute instances are live views
      // whose contents change on every incrementToken() call.
      for (int i=0; stream.incrementToken(); i++) {
        assertTrue(sink.incrementToken());
        // NOTE(review): the "== true" is redundant — equals() already returns boolean.
        assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
      }
     
View Full Code Here


    // and ensure they are the same as the ones we produced in serial fashion.

    // First pass (serial): record the analyzed output for each random term.
    for (int i = 0; i < numTestPoints; i++) {
      String term = _TestUtil.randomSimpleString(random);
      TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
      CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      assertTrue(ts.incrementToken());
      // ensure we make a copy of the actual bytes too
      // (toString() snapshots the attribute's buffer; the attribute itself is reused)
      map.put(term, encodedBytes.toString());
    }
   
    // Second pass (concurrent): every thread re-analyzes all terms and must
    // observe exactly the output recorded by the serial pass above.
    Thread threads[] = new Thread[numThreads];
    for (int i = 0; i < numThreads; i++) {
      threads[i] = new Thread() {
        @Override
        public void run() {
          try {
            for (Map.Entry<String,String> mapping : map.entrySet()) {
              String term = mapping.getKey();
              String expected = mapping.getValue();
              TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
              CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
              ts.reset();
              assertTrue(ts.incrementToken());
              assertEquals(expected, encodedBytes.toString());
            }
          } catch (IOException e) {
            // Rethrow unchecked so the failure escapes Runnable.run().
            throw new RuntimeException(e);
          }
        }
View Full Code Here

    // NOTE(review): excerpt of a Lucene-Highlighter-style fragmenting method;
    // the signature line is truncated above — presumably getBestTextFragments.
    throws IOException, InvalidTokenOffsetsException
  {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText=new StringBuilder();
   
      CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
      tokenStream.addAttribute(PositionIncrementAttribute.class);
      tokenStream.reset();
     
    // The first fragment starts at the beginning of the (still empty) output buffer.
    TextFragment currentFrag =  new TextFragment(newText,newText.length(), docFrags.size());
   
    if (fragmentScorer instanceof QueryScorer) {
      ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }
   
    // The scorer may wrap the stream (e.g. for caching); use its stream if so.
    TokenStream newStream = fragmentScorer.init(tokenStream);
    if(newStream != null) {
      tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try
    {

      String tokenText;
      int startOffset;
      int endOffset;
      int lastEndOffset = 0;
      textFragmenter.start(text, tokenStream);

      TokenGroup tokenGroup=new TokenGroup(tokenStream);

      // Consume tokens until exhaustion or until a token starts past the
      // analysis character budget.
      for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
            next = tokenStream.incrementToken())
      {
        // Offsets beyond the supplied text indicate the stream was built from
        // different content than 'text' — fail loudly rather than mis-highlight.
        if(  (offsetAtt.endOffset()>text.length())
          ||
          (offsetAtt.startOffset()>text.length())
          )           
        {
          throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
              +" exceeds length of provided text sized "+text.length());
        }
        if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
        {
          //the current token is distinct from previous tokens -
View Full Code Here

        // Tokenize 'text' with the fulltext analyzer, tracking character
        // offsets so gaps between tokens can be inspected below.
        List<String> tokens = new ArrayList<String>();
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream(FieldNames.FULLTEXT,
                    new StringReader(text));
            CharTermAttribute termAtt = stream
                    .addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream
                    .addAttribute(OffsetAttribute.class);
            // TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            stream.reset();

            // poz tracks how far into the original text we have consumed;
            // a token starting past poz means untokenized characters in between.
            int poz = 0;
            boolean hasFulltextToken = false;
            StringBuilder token = new StringBuilder();
            while (stream.incrementToken()) {
                String term = termAtt.toString();
                int start = offsetAtt.startOffset();
                int end = offsetAtt.endOffset();
                if (start > poz) {
                    // Scan the skipped-over gap for special fulltext characters.
                    for (int i = poz; i < start; i++) {
                        for (char c : fulltextTokens) {
View Full Code Here

        // Analyze 'text' under the "contents" field and dump the tokens.
        displayTokens(analyzer.tokenStream("contents", new StringReader(text)));
    }

    public static void displayTokens(TokenStream stream) throws IOException {

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println("[" + term.toString() + "] ");
        }

    }
View Full Code Here

            // NOTE(review): method signature truncated above; this body dumps
            // tokens grouped by position (synonyms share one position).
            throws IOException {

        TokenStream stream = analyzer.tokenStream("contents",
                new StringReader(text));

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream
                .addAttribute(PositionIncrementAttribute.class);

        // NOTE(review): no stream.reset()/end()/close() here — the TokenStream
        // consumer contract requires reset() before incrementToken(); confirm
        // against the Lucene version this targets.
        int position = 0;
        while (stream.incrementToken()) {

            int increment = posIncr.getPositionIncrement();
            // increment == 0 means this token occupies the same position as
            // the previous one (e.g. a synonym) — keep it on the same line.
            if (increment > 0) {
                position = position + increment;
                System.out.println();
                System.out.print(position + ":");
            }

            System.out.print("[" + term.toString() + "] ");

        }
        System.out.println();

    }
View Full Code Here

            // NOTE(review): method signature truncated above; this body dumps
            // term text, offsets, type and payload per position.
            throws IOException {

        TokenStream stream = analyzer.tokenStream("contents",
                new StringReader(text));

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream
                .addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

        // NOTE(review): no stream.reset()/end()/close() — the TokenStream
        // consumer contract requires reset() before incrementToken().
        int position = 0;
        while (stream.incrementToken()) {

            int increment = posIncr.getPositionIncrement();
            // Zero increment = same position as previous token (synonym).
            if (increment > 0) {
                position = position + increment;
                System.out.println();
                System.out.print(position + ":");
            }

            BytesRef pl = payload.getPayload();

            if (pl != null) {
                // NOTE(review): new String(pl.bytes) ignores pl.offset/pl.length
                // and the charset — may print stale or mis-decoded bytes; should
                // be new String(pl.bytes, pl.offset, pl.length, ...).
                System.out.print("[" + term.toString() + ":" + offset.startOffset()
                        + "->" + offset.endOffset() + ":" + type.type() + ":"
                        + new String(pl.bytes) + "] ");

            } else {
                System.out.print("[" + term.toString() + ":" + offset.startOffset()
                        + "->" + offset.endOffset() + ":" + type.type() + "] ");

            }
        }
        System.out.println();
View Full Code Here

    // Analyzer configured to split on whitespace, preserving case.
    config.set("pattern", "\\s+");
    config.set("lowercase", "false");
    CustomPatternAnalyzer a = new CustomPatternAnalyzer(config);
   
    TokenStream tokenStream = a.tokenStream("test", "this is a Text with Whitespaces");
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    // NOTE(review): incrementToken() is called without reset() first and its
    // boolean return is ignored; also, two calls yielding "Text"/"Whitespaces"
    // presumes CustomPatternAnalyzer drops the intervening words — verify.
    tokenStream.incrementToken();
    String t2 = charTermAttribute.toString();
    tokenStream.incrementToken();
    String t3 = charTermAttribute.toString();
   
    assertEquals("Second Token did not match!", "Text", t2);
    assertEquals("Third Token did not match!", "Whitespaces", t3);
   
  }
View Full Code Here

    // Analyzer configured to split on whitespace, lowercasing each token.
    config.set("pattern", "\\s+");
    config.set("lowercase", "true");
    CustomPatternAnalyzer a = new CustomPatternAnalyzer(config);
   
    TokenStream tokenStream = a.tokenStream("test", "this is a Text with Whitespaces");
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    // NOTE(review): incrementToken() is called without reset() first and its
    // boolean return is ignored — TokenStream contract violation; verify the
    // analyzer tolerates this.
    tokenStream.incrementToken();
    String t2 = charTermAttribute.toString();
    tokenStream.incrementToken();
    String t3 = charTermAttribute.toString();
   
    assertEquals("Second Token did not match!", "text", t2);
    assertEquals("Third Token did not match!", "whitespaces", t3);
   
  }
View Full Code Here

    // No explicit "lowercase" setting: expected lowercase output below implies
    // lowercasing is the analyzer's default — verify against CustomPatternAnalyzer.
    GenericConfiguration config = new GenericConfiguration();
    config.set("pattern", "\\s+");
    CustomPatternAnalyzer a = new CustomPatternAnalyzer(config);
   
    TokenStream tokenStream = a.tokenStream("test", "this is a Text with Whitespaces");
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    // NOTE(review): incrementToken() is called without reset() first and its
    // boolean return is ignored — TokenStream contract violation.
    tokenStream.incrementToken();
    String t2 = charTermAttribute.toString();
    tokenStream.incrementToken();
    String t3 = charTermAttribute.toString();
   
    assertEquals("Second Token did not match!", "text", t2);
    assertEquals("Third Token did not match!", "whitespaces", t3);
   
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.