Package: org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute


      // Compare a TeeSinkTokenFilter's sink output against an equivalent
      // stand-alone filter chain built over the same input text.
      //make sure we produce the same tokens
      TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))));
      TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
      // Drain the tee fully so the sink has been fed every token before it
      // is consumed below.
      teeStream.consumeAllTokens();
      TokenStream stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.toString()))), 100);
      CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
      CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
      // Walk both streams in lock-step; the attribute instances are live views
      // whose contents change on every incrementToken() call.
      for (int i=0; stream.incrementToken(); i++) {
        assertTrue(sink.incrementToken());
        // NOTE(review): the "== true" is redundant — equals() already returns boolean.
        assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
      }
     
View Full Code Here


    // and ensure they are the same as the ones we produced in serial fashion.

    // First pass (serial): record the analyzed output for each random term.
    for (int i = 0; i < numTestPoints; i++) {
      String term = _TestUtil.randomSimpleString(random);
      TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
      CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      assertTrue(ts.incrementToken());
      // ensure we make a copy of the actual bytes too
      // (toString() snapshots the attribute's buffer; the attribute itself is reused)
      map.put(term, encodedBytes.toString());
    }
   
    // Second pass (concurrent): every thread re-analyzes all terms and must
    // observe exactly the output recorded by the serial pass above.
    Thread threads[] = new Thread[numThreads];
    for (int i = 0; i < numThreads; i++) {
      threads[i] = new Thread() {
        @Override
        public void run() {
          try {
            for (Map.Entry<String,String> mapping : map.entrySet()) {
              String term = mapping.getKey();
              String expected = mapping.getValue();
              TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
              CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
              ts.reset();
              assertTrue(ts.incrementToken());
              assertEquals(expected, encodedBytes.toString());
            }
          } catch (IOException e) {
            // Rethrow unchecked so the failure escapes Runnable.run().
            throw new RuntimeException(e);
          }
        }
View Full Code Here

    // NOTE(review): excerpt of a Lucene-Highlighter-style fragmenting method;
    // the signature line is truncated above — presumably getBestTextFragments.
    throws IOException, InvalidTokenOffsetsException
  {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText=new StringBuilder();
   
      CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
      tokenStream.addAttribute(PositionIncrementAttribute.class);
      tokenStream.reset();
     
    // The first fragment starts at the beginning of the (still empty) output buffer.
    TextFragment currentFrag =  new TextFragment(newText,newText.length(), docFrags.size());
   
    if (fragmentScorer instanceof QueryScorer) {
      ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }
   
    // The scorer may wrap the stream (e.g. for caching); use its stream if so.
    TokenStream newStream = fragmentScorer.init(tokenStream);
    if(newStream != null) {
      tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try
    {

      String tokenText;
      int startOffset;
      int endOffset;
      int lastEndOffset = 0;
      textFragmenter.start(text, tokenStream);

      TokenGroup tokenGroup=new TokenGroup(tokenStream);

      // Consume tokens until exhaustion or until a token starts past the
      // analysis character budget.
      for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
            next = tokenStream.incrementToken())
      {
        // Offsets beyond the supplied text indicate the stream was built from
        // different content than 'text' — fail loudly rather than mis-highlight.
        if(  (offsetAtt.endOffset()>text.length())
          ||
          (offsetAtt.startOffset()>text.length())
          )           
        {
          throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
              +" exceeds length of provided text sized "+text.length());
        }
        if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
        {
          //the current token is distinct from previous tokens -
View Full Code Here

        // Tokenize 'text' with the fulltext analyzer, tracking character
        // offsets so gaps between tokens can be inspected below.
        List<String> tokens = new ArrayList<String>();
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream(FieldNames.FULLTEXT,
                    new StringReader(text));
            CharTermAttribute termAtt = stream
                    .addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream
                    .addAttribute(OffsetAttribute.class);
            // TypeAttribute type = stream.addAttribute(TypeAttribute.class);

            stream.reset();

            // poz tracks how far into the original text we have consumed;
            // a token starting past poz means untokenized characters in between.
            int poz = 0;
            boolean hasFulltextToken = false;
            StringBuilder token = new StringBuilder();
            while (stream.incrementToken()) {
                String term = termAtt.toString();
                int start = offsetAtt.startOffset();
                int end = offsetAtt.endOffset();
                if (start > poz) {
                    // Scan the skipped-over gap for special fulltext characters.
                    for (int i = poz; i < start; i++) {
                        for (char c : fulltextTokens) {
View Full Code Here

        // Analyze 'text' under the "contents" field and dump the tokens.
        displayTokens(analyzer.tokenStream("contents", new StringReader(text)));
    }

    public static void displayTokens(TokenStream stream) throws IOException {

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println("[" + term.toString() + "] ");
        }

    }
View Full Code Here

            // NOTE(review): method signature truncated above; this body dumps
            // tokens grouped by position (synonyms share one position).
            throws IOException {

        TokenStream stream = analyzer.tokenStream("contents",
                new StringReader(text));

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream
                .addAttribute(PositionIncrementAttribute.class);

        // NOTE(review): no stream.reset()/end()/close() here — the TokenStream
        // consumer contract requires reset() before incrementToken(); confirm
        // against the Lucene version this targets.
        int position = 0;
        while (stream.incrementToken()) {

            int increment = posIncr.getPositionIncrement();
            // increment == 0 means this token occupies the same position as
            // the previous one (e.g. a synonym) — keep it on the same line.
            if (increment > 0) {
                position = position + increment;
                System.out.println();
                System.out.print(position + ":");
            }

            System.out.print("[" + term.toString() + "] ");

        }
        System.out.println();

    }
View Full Code Here

            // NOTE(review): method signature truncated above; this body dumps
            // term text, offsets, type and payload per position.
            throws IOException {

        TokenStream stream = analyzer.tokenStream("contents",
                new StringReader(text));

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream
                .addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

        // NOTE(review): no stream.reset()/end()/close() — the TokenStream
        // consumer contract requires reset() before incrementToken().
        int position = 0;
        while (stream.incrementToken()) {

            int increment = posIncr.getPositionIncrement();
            // Zero increment = same position as previous token (synonym).
            if (increment > 0) {
                position = position + increment;
                System.out.println();
                System.out.print(position + ":");
            }

            BytesRef pl = payload.getPayload();

            if (pl != null) {
                // NOTE(review): new String(pl.bytes) ignores pl.offset/pl.length
                // and the charset — may print stale or mis-decoded bytes; should
                // be new String(pl.bytes, pl.offset, pl.length, ...).
                System.out.print("[" + term.toString() + ":" + offset.startOffset()
                        + "->" + offset.endOffset() + ":" + type.type() + ":"
                        + new String(pl.bytes) + "] ");

            } else {
                System.out.print("[" + term.toString() + ":" + offset.startOffset()
                        + "->" + offset.endOffset() + ":" + type.type() + "] ");

            }
        }
        System.out.println();
View Full Code Here

    // Analyzer configured to split on whitespace, preserving case.
    config.set("pattern", "\\s+");
    config.set("lowercase", "false");
    CustomPatternAnalyzer a = new CustomPatternAnalyzer(config);
   
    TokenStream tokenStream = a.tokenStream("test", "this is a Text with Whitespaces");
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    // NOTE(review): incrementToken() is called without reset() first and its
    // boolean return is ignored; also, two calls yielding "Text"/"Whitespaces"
    // presumes CustomPatternAnalyzer drops the intervening words — verify.
    tokenStream.incrementToken();
    String t2 = charTermAttribute.toString();
    tokenStream.incrementToken();
    String t3 = charTermAttribute.toString();
   
    assertEquals("Second Token did not match!", "Text", t2);
    assertEquals("Third Token did not match!", "Whitespaces", t3);
   
  }
View Full Code Here

    // Analyzer configured to split on whitespace, lowercasing each token.
    config.set("pattern", "\\s+");
    config.set("lowercase", "true");
    CustomPatternAnalyzer a = new CustomPatternAnalyzer(config);
   
    TokenStream tokenStream = a.tokenStream("test", "this is a Text with Whitespaces");
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    // NOTE(review): incrementToken() is called without reset() first and its
    // boolean return is ignored — TokenStream contract violation; verify the
    // analyzer tolerates this.
    tokenStream.incrementToken();
    String t2 = charTermAttribute.toString();
    tokenStream.incrementToken();
    String t3 = charTermAttribute.toString();
   
    assertEquals("Second Token did not match!", "text", t2);
    assertEquals("Third Token did not match!", "whitespaces", t3);
   
  }
View Full Code Here

    // No explicit "lowercase" setting: expected lowercase output below implies
    // lowercasing is the analyzer's default — verify against CustomPatternAnalyzer.
    GenericConfiguration config = new GenericConfiguration();
    config.set("pattern", "\\s+");
    CustomPatternAnalyzer a = new CustomPatternAnalyzer(config);
   
    TokenStream tokenStream = a.tokenStream("test", "this is a Text with Whitespaces");
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    // NOTE(review): incrementToken() is called without reset() first and its
    // boolean return is ignored — TokenStream contract violation.
    tokenStream.incrementToken();
    String t2 = charTermAttribute.toString();
    tokenStream.incrementToken();
    String t3 = charTermAttribute.toString();
   
    assertEquals("Second Token did not match!", "text", t2);
    assertEquals("Third Token did not match!", "whitespaces", t3);
   
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.