Examples of WhitespaceTokenizer


Examples of eu.stratosphere.util.SimpleStringUtils.WhitespaceTokenizer

  }
 
  @Test
  public void testTokenizerOnStringWithoutNexToken() {
    StringValue testString = new StringValue("test");
    SimpleStringUtils.WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setStringToTokenize(testString);
    //first token
    tokenizer.next(testString);
    //next token is not exist
    assertFalse(tokenizer.next(testString));
  }
View Full Code Here

Examples of org.apache.flink.util.SimpleStringUtils.WhitespaceTokenizer

  }
 
  @Test
  public void testTokenizerOnStringWithoutNexToken() {
    StringValue testString = new StringValue("test");
    SimpleStringUtils.WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setStringToTokenize(testString);
    //first token
    tokenizer.next(testString);
    //next token is not exist
    assertFalse(tokenizer.next(testString));
  }
View Full Code Here

Examples of org.apache.lucene.analysis.WhitespaceTokenizer

        tokenBuilder.append((String) groupIter.next());
        tokenBuilder.append(" ");
      }
   
      //doc.add(new Field("groups", new IteratorTokenStream(groupIter)));
      doc.add(new Field("groups", new WhitespaceTokenizer(new StringReader(tokenBuilder.toString()))));
    }

    // Add the URL of the document
    doc.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
   
    // Add the file name (without protocol, drive-letter and path)
    String filenameWithVariants = RegainToolkit.urlToWhitespacedFileName(url);
    doc.add(new Field("filename", new WhitespaceTokenizer(new StringReader(filenameWithVariants))));
    PathFilenamePair pfPair = RegainToolkit.fragmentUrl(url);

    // Add the filename field for sorting
    doc.add(new Field("filename_sort", pfPair.getFilename(), Field.Store.YES, Field.Index.NOT_ANALYZED));
View Full Code Here

Examples of org.apache.lucene.analysis.WhitespaceTokenizer

   * @see org.apache.lucene.analysis.KeywordAnalyzer#tokenStream(java.lang.String,
   * java.io.Reader)
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new WhitespaceTokenizer(reader);
    result = new LowerCaseFilter(result);

    return result;
  }
View Full Code Here

Examples of org.apache.lucene.analysis.WhitespaceTokenizer

  private TokenStream luceneTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) {
    TokenStream stream;
    if (letters)
      stream = new LetterTokenizer(new StringReader(text));
    else
      stream = new WhitespaceTokenizer(new StringReader(text));
    if (toLowerCasestream = new LowerCaseFilter(stream);
    if (stopWords != null) stream = new StopFilter(stream, stopWords);
    return stream;           
  }
View Full Code Here

Examples of org.apache.lucene.analysis.WhitespaceTokenizer

  public void testPositionIncrementGap() throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
      }

      @Override
      public int getPositionIncrementGap(String fieldName) {
        return 500;
View Full Code Here

Examples of org.apache.lucene.analysis.WhitespaceTokenizer

  public void testTokenReuse() throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader)) {
          boolean first=true;
          AttributeSource.State state;

          @Override
          public boolean incrementToken() throws IOException {
View Full Code Here

Examples of org.apache.lucene.analysis.WhitespaceTokenizer

      super(matchVersion, new WhitespaceAnalyzer());
    }
   
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new WhitespaceTokenizer(reader);
    }   
View Full Code Here

Examples of org.apache.lucene.analysis.WhitespaceTokenizer

  private class NonreusableAnalyzer extends Analyzer {
    int invocationCount = 0;
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      if (++invocationCount % 2 == 0)
        return new WhitespaceTokenizer(reader);
      else
        return new LetterTokenizer(reader);
    }
View Full Code Here

Examples of org.apache.lucene.analysis.WhitespaceTokenizer

    /**
     * Creates the analyzer, passing {@code matchVersion} straight to the
     * superclass constructor.
     *
     * @param matchVersion version value forwarded to the superclass
     */
    public DutchSubclassAnalyzer(Version matchVersion) {
      super(matchVersion);
    }
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new WhitespaceTokenizer(reader);
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.