Examples of org.apache.lucene.analysis.Analyzer

org.apache.lucene.analysis.Analyzer
An Analyzer builds TokenStreams, which analyze text. It thus represents a policy for extracting index terms from text.
Typical implementations first build a Tokenizer, which breaks the stream of characters from the Reader into raw Tokens. One or more TokenFilters may then be applied to the output of the Tokenizer.


  void addDocs(Directory dir, final int ndocs, String field, final String val, final int maxTF, final float percentDocs) throws IOException {
    final Random random = newRandom();
    final RepeatingTokenStream ts = new RepeatingTokenStream(val);


    Analyzer analyzer = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        if (random.nextFloat() < percentDocs) ts.num = random.nextInt(maxTF)+1;
        else ts.num=0;
        return ts;

View Full Code Here


  // Tests whether the DocumentWriter correctly enable the
  // omitTermFreqAndPositions bit in the FieldInfo
  public void testOmitTermFreqAndPositions() throws Exception {
    Directory ram = new MockRAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_CURRENT);
    IndexWriter writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
    Document d = new Document();
        
    // this field will have Tf
    Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.ANALYZED);

View Full Code Here

 
  // Tests whether merging of docs that have different
  // omitTermFreqAndPositions for the same field works
  public void testMixedMerge() throws Exception {
    Directory ram = new MockRAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_CURRENT);
    IndexWriter writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
    writer.setMaxBufferedDocs(3);
    writer.setMergeFactor(2);
    Document d = new Document();

View Full Code Here

  // Make sure first adding docs that do not omitTermFreqAndPositions for
  // field X, then adding docs that do omitTermFreqAndPositions for that same
  // field, 
  public void testMixedRAM() throws Exception {
    Directory ram = new MockRAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_CURRENT);
    IndexWriter writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
    writer.setMaxBufferedDocs(10);
    writer.setMergeFactor(2);
    Document d = new Document();

View Full Code Here

  }


  // Verifies no *.prx exists when all fields omit term freq:
  public void testNoPrxFile() throws Throwable {
    Directory ram = new MockRAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_CURRENT);
    IndexWriter writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
    writer.setMaxBufferedDocs(3);
    writer.setMergeFactor(2);
    writer.setUseCompoundFile(false);
    Document d = new Document();

View Full Code Here

  }
 
  // Test scores with one field with Term Freqs and one without, otherwise with equal content 
  public void testBasic() throws Exception {
    Directory dir = new MockRAMDirectory();  
    Analyzer analyzer = new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_CURRENT);
    IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
    writer.setMergeFactor(2);
    writer.setMaxBufferedDocs(2);
    writer.setSimilarity(new SimpleSimilarity());

View Full Code Here

  }
  */
  
  public void testWithCachingFilter() throws Exception {
    Directory dir = new RAMDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer();
  
    IndexWriter writer = new IndexWriter(dir, analyzer, true, MaxFieldLength.LIMITED);
    writer.close();
  
    Searcher searcher = new IndexSearcher(dir, true);

View Full Code Here

    qp.setDefaultOperator(QueryParser.OR_OPERATOR);
    assertEquals(QueryParser.OR_OPERATOR, qp.getDefaultOperator());
  }


  public void testPunct() throws Exception {
    Analyzer a = new WhitespaceAnalyzer();
    assertQueryEquals("a&b", a, "a&b");
    assertQueryEquals("a&&b", a, "a&&b");
    assertQueryEquals(".NET", a, ".NET");
  }

View Full Code Here

// The numbers go away because SimpleAnalzyer ignores them
    assertQueryEquals("3", null, "");
    assertQueryEquals("term 1.0 1 2", null, "term");
    assertQueryEquals("term term1 term2", null, "term term term");


    Analyzer a = new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_CURRENT);
    assertQueryEquals("3", a, "3");
    assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2");
    assertQueryEquals("term term1 term2", a, "term term1 term2");
  }

View Full Code Here

    assertQueryEquals(qp, field, field + ":{" + escapeDateString(startDate) + " TO " + escapeDateString(endDate) + "}",
               "{" + getDate(startDate, resolution) + " TO " + getDate(endDate, resolution) + "}");
  }


  public void testEscaped() throws Exception {
    Analyzer a = new WhitespaceAnalyzer();
    
    /*assertQueryEquals("\\[brackets", a, "\\[brackets");
    assertQueryEquals("\\[brackets", null, "brackets");
    assertQueryEquals("\\\\", a, "\\\\");
    assertQueryEquals("\\+blah", a, "\\+blah");

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.analysis.Analyzer

$.IndexAndSearchTest

com.browseengine.bobo.test.BoboTestCase

com.gentics.cr.lucene.indexer.index.LuceneAnalyzerFactory

it.eng.spagobi.commons.utilities.indexing.LuceneSearcher

org.apache.jackrabbit.core.query.lucene.JackrabbitAnalyzer

org.apache.jena.query.text.assembler.EntityDefinitionAssembler

org.apache.jena.query.text.TextIndexLucene

org.apache.lenya.cms.search.usecases.Search

org.apache.lucene.analysis.cjk.TestCJKTokenizer

org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.