Package org.apache.lucene.analysis.tokenattributes

Examples of org.apache.lucene.analysis.tokenattributes.TermAttribute
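
The snippets below appear to be drawn from several codebases (Lucene and Solr tests, Apache Jackrabbit, Apache Mahout) and target the Lucene 2.9/3.0-era API, in which TermAttribute carried a token's text; later releases replaced it with CharTermAttribute. For orientation, here is a minimal, self-contained sketch of the usual consumption pattern (the field name and input text are arbitrary):

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class TermAttributeBasics {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new WhitespaceAnalyzer()
            .tokenStream("field", new StringReader("some example text"));
        // addAttribute registers the attribute if absent and returns the shared instance
        TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(termAtt.term()); // text of the current token
        }
        ts.end();
        ts.close();
      }
    }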


                      String field,
                      Set<?> stop)
                      throws IOException {
    TokenStream ts = a.tokenStream(field, new StringReader(body));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);

    BooleanQuery tmp = new BooleanQuery();
    Set<String> already = new HashSet<String>(); // ignore duplicates
    while (ts.incrementToken()) {
      String word = termAtt.term();
      // skip optional stop words
      if (stop != null && stop.contains(word)) continue;
      // skip duplicates
      if (!already.add(word)) continue;
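The excerpt above is truncated at both ends. A self-contained sketch of the same idea follows; the class and method names, the Analyzer parameter a, and the String parameter body are assumptions inferred from the loop body, and the SHOULD clause per distinct term is one plausible completion of the cut-off loop:

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.HashSet;
    import java.util.Set;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.TermQuery;

    public class TermQueryBuilder {
      // Hypothetical reconstruction: one SHOULD clause per distinct, non-stop term.
      public static BooleanQuery toQuery(Analyzer a, String body, String field, Set<?> stop)
          throws IOException {
        TokenStream ts = a.tokenStream(field, new StringReader(body));
        TermAttribute termAtt = ts.addAttribute(TermAttribute.class);

        BooleanQuery query = new BooleanQuery();
        Set<String> already = new HashSet<String>(); // ignore duplicates
        while (ts.incrementToken()) {
          String word = termAtt.term();
          if (stop != null && stop.contains(word)) continue; // optional stop list
          if (!already.add(word)) continue;                  // seen before
          query.add(new TermQuery(new Term(field, word)), BooleanClause.Occur.SHOULD);
        }
        ts.close();
        return query;
      }
    }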


    Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
    for (Fieldable field : fields) {

        // assume properties fields use SingletonTokenStream
        TokenStream tokenStream = field.tokenStreamValue();
        TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
        PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
        tokenStream.incrementToken();
        tokenStream.end();
        tokenStream.close();

        String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
        if (value.startsWith(namePrefix)) {
            // strip the prefix to get the raw value
            value = value.substring(namePrefix.length());
            // re-create the named value against the translated path
            Path p = getRelativePath(state, propState);
            String path = getNamespaceMappings().translatePath(p);
            value = FieldNames.createNamedValue(path, value);
            termAttribute.setTermBuffer(value);
            doc.add(new Field(field.name(),
                    new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone())));
            doc.add(new Field(
                    FieldNames.AGGREGATED_NODE_UUID,
                    false,
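Stripped of the Jackrabbit-specific plumbing, the pattern above is: consume the single token of a per-field stream, read its text via termBuffer()/termLength(), rewrite it, and push the new text back with setTermBuffer(). A minimal sketch with illustrative class, method, and parameter names:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class TokenRewrite {
      // Illustrative helper: rewrite the prefix of a one-token stream's term text.
      public static String rewritePrefix(TokenStream stream, String oldPrefix, String newPrefix)
          throws IOException {
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        if (!stream.incrementToken()) {
          return null; // stream was empty
        }
        stream.end();
        stream.close();

        String value = new String(termAtt.termBuffer(), 0, termAtt.termLength());
        if (value.startsWith(oldPrefix)) {
          value = newPrefix + value.substring(oldPrefix.length());
          termAtt.setTermBuffer(value); // write the new text back into the attribute
        }
        return value;
      }
    }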

        Reader r = new StringReader(text);
        TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
        // the attribute instances never change, so fetch them once up front
        OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
        TermAttribute term = ts.getAttribute(TermAttribute.class);
        try {
            while (ts.incrementToken()) {
                String termText = term.term();
                TermVectorOffsetInfo[] info = termMap.get(termText);
                if (info == null) {
                    info = new TermVectorOffsetInfo[1];
                } else {
                    // grow the array manually to make room for one more entry
                    TermVectorOffsetInfo[] tmp = info;
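The excerpt cuts off inside the manual array-growing branch. The same bookkeeping reads more simply with a List; here is a sketch, assuming an arbitrary Lucene 2.9/3.0 Analyzer (TermVectorOffsetInfo is the org.apache.lucene.index class):

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.index.TermVectorOffsetInfo;

    public class OffsetCollector {
      public static Map<String, List<TermVectorOffsetInfo>> collect(Analyzer analyzer, String text)
          throws IOException {
        Map<String, List<TermVectorOffsetInfo>> termMap =
            new HashMap<String, List<TermVectorOffsetInfo>>();
        TokenStream ts = analyzer.tokenStream("", new StringReader(text));
        // fetch the shared attribute instances once, before iterating
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        try {
          while (ts.incrementToken()) {
            List<TermVectorOffsetInfo> info = termMap.get(term.term());
            if (info == null) {
              info = new ArrayList<TermVectorOffsetInfo>();
              termMap.put(term.term(), info);
            }
            info.add(new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset()));
          }
        } finally {
          ts.close();
        }
        return termMap;
      }
    }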

    rules.add("a b c,d");
    SynonymMap synMap = new SynonymMap(true);
    SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);

    SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap);
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);

    ts.reset();
    List<String> tokens = new ArrayList<String>();
    while (ts.incrementToken()) tokens.add(termAtt.term());

    // This fails: the token stream actually produces ["e", "e"]
    Assert.assertEquals(Arrays.asList("a", "e"), tokens);
  }

  private void assertAnalyzesTo(Analyzer a, String input, String[] output,
      int[] startOffsets, int[] endOffsets, int[] posIncs) throws Exception {

    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt =
        (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
    for (int i = 0; i < output.length; i++) {
      assertTrue(ts.incrementToken());
      assertEquals(output[i], termAtt.term());
      assertEquals(startOffsets[i], offsetAtt.startOffset());
      assertEquals(endOffsets[i], offsetAtt.endOffset());
      assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
    }
    assertFalse(ts.incrementToken());
    ts.close();
  }
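A call to this helper might look like the following; the analyzer and the expected values are illustrative and assume whitespace-style tokenization:

    assertAnalyzesTo(analyzer, "the quick",
        new String[] { "the", "quick" }, // expected terms
        new int[] { 0, 4 },              // start offsets
        new int[] { 3, 9 },              // end offsets
        new int[] { 1, 1 });             // position increments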

  public void testReset() throws Exception {
    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
   
    TermAttribute term = (TermAttribute) cgf.addAttribute(TermAttribute.class);
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.term());
    assertTrue(cgf.incrementToken());
    assertEquals("How_the", term.term());
    assertTrue(cgf.incrementToken());
    assertEquals("the", term.term());
    assertTrue(cgf.incrementToken());
    assertEquals("the_s", term.term());
   
    wt.reset(new StringReader(input));
    cgf.reset();
    assertTrue(cgf.incrementToken());
    assertEquals("How", term.term());
  }

 
  public void testReset() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
    TokenStream ts = new AB_AAB_Stream(tokenizer);
    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
    assertTrue(ts.incrementToken());
    assertEquals("How", term.term());
    assertTrue(ts.incrementToken());
    assertEquals("now", term.term());
    assertTrue(ts.incrementToken());
    assertEquals("A", term.term());
    // reset back to the input;
    // if reset() did not work correctly, previously buffered tokens would remain
    tokenizer.reset(new StringReader(input));
    ts.reset();
    assertTrue(ts.incrementToken());
    assertEquals("How", term.term());
  }

    final String input = "How the s a brown s cow d like A B thing?";
    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
    CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);
   
    // attributes are shared along the whole chain, so the tokenizer's
    // TermAttribute reflects what the query filter emits
    TermAttribute term = (TermAttribute) wt.addAttribute(TermAttribute.class);
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.term());
    assertTrue(nsf.incrementToken());
    assertEquals("the_s", term.term());
   
    wt.reset(new StringReader(input));
    nsf.reset();
    assertTrue(nsf.incrementToken());
    assertEquals("How_the", term.term());
  }
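The three reset tests above all exercise the same two-step contract: point the Tokenizer at fresh input, then call reset() on the outermost stream so that filters discard any buffered state. A generic sketch of the pattern, with a stock LowerCaseFilter standing in for the filters under test:

    import java.io.StringReader;

    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class ResetPattern {
      public static void main(String[] args) throws Exception {
        final String input = "One Two Three";
        WhitespaceTokenizer source = new WhitespaceTokenizer(new StringReader(input));
        TokenStream chain = new LowerCaseFilter(source);
        TermAttribute term = chain.addAttribute(TermAttribute.class);

        chain.incrementToken();
        System.out.println(term.term()); // "one"

        source.reset(new StringReader(input)); // rewind the tokenizer onto fresh input
        chain.reset();                         // let the filters discard buffered state

        chain.incrementToken();
        System.out.println(term.term()); // "one" again
      }
    }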

                    new Token(b, 0, b.length, 6, 10),
                    new Token(ccc, 0, ccc.length, 11, 15),
                    new Token(whitespace, 0, whitespace.length, 16, 20),
                    new Token(empty, 0, empty.length, 21, 21)), false);

    // one shared TermAttribute instance is re-read after each incrementToken()
    assertTrue(ts.incrementToken());
    TermAttribute token = (TermAttribute) ts.getAttribute(TermAttribute.class);
    assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
    assertTrue(ts.incrementToken());
    assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
    assertTrue(ts.incrementToken());
    assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
    assertTrue(ts.incrementToken());
    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
    assertTrue(ts.incrementToken());
    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
    assertFalse(ts.incrementToken());

    a = " a".toCharArray();
    b = "b ".toCharArray();
    ccc = " c ".toCharArray();
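As an aside, the new String(token.termBuffer(), 0, token.termLength()) idiom used in this test is equivalent to token.term(): only the first termLength() chars of the reusable, over-allocated buffer hold the token text. A minimal demonstration:

    import java.io.StringReader;

    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class BufferVsTerm {
      public static void main(String[] args) throws Exception {
        WhitespaceTokenizer ts = new WhitespaceTokenizer(new StringReader("abc"));
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        ts.incrementToken();
        String viaTerm = term.term();
        String viaBuffer = new String(term.termBuffer(), 0, term.termLength());
        System.out.println(viaTerm.equals(viaBuffer)); // true: term() is the shorthand
        ts.close();
      }
    }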

  private Analyzer analyzer;

  @Override
  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
      if (termAtt.termLength() > 0) {
        document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
      }
    }
    stream.close();
    context.write(key, document);
  }
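The tokenize-and-collect core of this mapper is easy to exercise outside Hadoop. A sketch with the MapReduce types stripped away (the class and method names are illustrative):

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class PlainTokenizer {
      public static List<String> tokenize(Analyzer analyzer, String field, String text)
          throws IOException {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        List<String> tokens = new ArrayList<String>();
        while (stream.incrementToken()) {
          if (termAtt.termLength() > 0) { // skip tokens a filter emptied out
            tokens.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
          }
        }
        stream.close();
        return tokens;
      }
    }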
