Package org.apache.lucene.analysis.standard

Examples of org.apache.lucene.analysis.standard.StandardTokenizer


  }
  @Test
  public void testLuceneStandardTokenizer() throws Exception {
    String[] gold = {"I", "can't", "beleive", "that", "the", "Carolina", "Hurricanes", "won", "the", "2005", "2006", "Stanley", "Cup",};
    StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_36, new StringReader("I can't beleive that the Carolina Hurricanes won the 2005-2006 Stanley Cup."));
    List<String> result = new ArrayList<String>();
    while (tokenizer.incrementToken()) {
      result.add(((CharTermAttribute) tokenizer.getAttribute(CharTermAttribute.class)).toString());
    }
    assertTrue("result Size: " + result.size() + " is not: " + gold.length, result.size() == gold.length);
    int i = 0;
    for (String chunk : result) {
      assertTrue(chunk + " is not equal to " + gold[i], chunk.equals(gold[i]) == true);
View Full Code Here


  @Test
  public void testVikings() throws Exception {
    String[] gold = {"Last", "week", "the", "National", "Football", "League", "crowned", "a", "new", "Super", "Bowl", "Champion",
            "Minnesota", "Vikings", "fans", "will", "take", "little", "solace", "in", "the", "fact", "that", "they",
            "lost", "to", "the", "eventual", "champion", "in", "the", "playoffs"};
    StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_36, new StringReader("Last week the National Football League crowned a new Super Bowl Champion." +
            "  Minnesota Vikings fans will take little solace in the fact that they" +
            " lost to the eventual champion in the playoffs."));
    List<String> result = new ArrayList<String>();
    while (tokenizer.incrementToken()) {
      result.add(((CharTermAttribute) tokenizer.getAttribute(CharTermAttribute.class)).toString());
    }
    assertTrue("result Size: " + result.size() + " is not: " + gold.length, result.size() == gold.length);
    int i = 0;
    for (String chunk : result) {
      System.out.println(chunk);
View Full Code Here

    return maxTokenLength;
  }

  @Override
  protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    return new TokenStreamComponents(src, tok) {
      @Override
      protected void setReader(final Reader reader) throws IOException {
        src.setMaxTokenLength(NoStopWordStandardAnalyzer.this.maxTokenLength);
        super.setReader(reader);
      }
    };
  }
View Full Code Here

   * @return  A {@link TokenStream} built from a {@link StandardTokenizer} filtered with
   *       {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
   *          {@link BrazilianStemFilter}.
   */
  public final TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer( reader );
    result = new LowerCaseFilter( result );
    result = new StandardFilter( result );
    result = new StopFilter( result, stoptable );
    result = new BrazilianStemFilter( result, excltable );
    return result;
View Full Code Here

    public TokenStream reusableTokenStream(String fieldName, Reader reader)
      throws IOException {
      SavedStreams streams = (SavedStreams) getPreviousTokenStream();
      if (streams == null) {
        streams = new SavedStreams();
        streams.source = new StandardTokenizer(reader);
        streams.result = new LowerCaseFilter(streams.source);
        streams.result = new StandardFilter(streams.result);
        streams.result = new StopFilter(streams.result, stoptable);
        streams.result = new BrazilianStemFilter(streams.result, excltable);
        setPreviousTokenStream(streams);
View Full Code Here

   * @return A {@link TokenStream} built from a {@link StandardTokenizer}
   *   filtered with {@link StandardFilter}, {@link StopFilter},
   *   and {@link DutchStemFilter}
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new StopFilter(result, stoptable);
    result = new DutchStemFilter(result, excltable, stemdict);
    return result;
  }
View Full Code Here

    }
   
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      streams = new SavedStreams();
      streams.source = new StandardTokenizer(reader);
      streams.result = new StandardFilter(streams.source);
      streams.result = new StopFilter(streams.result, stoptable);
      streams.result = new DutchStemFilter(streams.result, excltable, stemdict);
      setPreviousTokenStream(streams);
    } else {
View Full Code Here

  public void testExceptionFromTokenStream() throws IOException {
    RAMDirectory dir = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new Analyzer() {

      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenFilter(new StandardTokenizer(reader)) {
          private int count = 0;

          public boolean incrementToken() throws IOException {
            if (count++ == 5) {
              throw new IOException();
View Full Code Here

      System.out.println("-----Tokens: " + tokCount[k] + "-----");
      for (int i = 0; i < tokCount[k]; i++) {
        buffer.append(English.intToEnglish(i).toUpperCase()).append(' ');
      }
      //make sure we produce the same tokens
      TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))));
      TokenStream sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(100));
      teeStream.consumeAllTokens();
      TokenStream stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100);
      TermAttribute tfTok = (TermAttribute) stream.addAttribute(TermAttribute.class);
      TermAttribute sinkTok = (TermAttribute) sink.addAttribute(TermAttribute.class);
      for (int i=0; stream.incrementToken(); i++) {
        assertTrue(sink.incrementToken());
        assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
      }
     
      //simulate two fields, each being analyzed once, for 20 documents
      for (int j = 0; j < modCounts.length; j++) {
        int tfPos = 0;
        long start = System.currentTimeMillis();
        for (int i = 0; i < 20; i++) {
          stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())));
          PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            tfPos += posIncrAtt.getPositionIncrement();
          }
          stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]);
          posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
          while (stream.incrementToken()) {
            tfPos += posIncrAtt.getPositionIncrement();
          }
        }
        long finish = System.currentTimeMillis();
        System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
        int sinkPos = 0;
        //simulate one field with one sink
        start = System.currentTimeMillis();
        for (int i = 0; i < 20; i++) {
          teeStream = new TeeSinkTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))));
          sink = teeStream.newSinkTokenStream(new ModuloSinkFilter(modCounts[j]));
          PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) teeStream.getAttribute(PositionIncrementAttribute.class);
          while (teeStream.incrementToken()) {
            sinkPos += posIncrAtt.getPositionIncrement();
          }
View Full Code Here

    char whitespace[] = new char[4094];
    Arrays.fill(whitespace, ' ');
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.standard.StandardTokenizer

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.