Package org.apache.lucene.analysis.standard

Examples of org.apache.lucene.analysis.standard.ClassicAnalyzer

ClassicAnalyzer was named StandardAnalyzer in Lucene versions prior to 3.1. As of 3.1, {@link StandardAnalyzer} implements Unicode text segmentation,as specified by UAX#29.

   * Make sure we skip wicked long terms.
  */
  public void testWickedLongTerm() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
      TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));

    char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
    Arrays.fill(chars, 'x');
    Document doc = new Document();
    final String bigTerm = new String(chars);

    // This produces a too-long term:
    String contents = "abc xyz x" + bigTerm + " another term";
    doc.add(new TextField("content", contents, Field.Store.NO));
    writer.addDocument(doc);

    // Make sure we can add another normal document
    doc = new Document();
    doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);

    // Make sure all terms < max size were indexed
    assertEquals(2, reader.docFreq(new Term("content", "abc")));
    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
    assertEquals(1, reader.docFreq(new Term("content", "term")));
    assertEquals(1, reader.docFreq(new Term("content", "another")));

    // Make sure position is still incremented when
    // massive term is skipped:
    DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader,
                                                                MultiFields.getLiveDocs(reader),
                                                                "content",
                                                                new BytesRef("another"));
    assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, tps.freq());
    assertEquals(3, tps.nextPosition());

    // Make sure the doc that has the massive term is in
    // the index:
    assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());

    reader.close();

    // Make sure we can add a document with exactly the
    // maximum length term, and search on that term:
    doc = new Document();
    doc.add(new TextField("content", bigTerm, Field.Store.NO));
    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    sa.setMaxTokenLength(100000);
    writer  = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
    writer.addDocument(doc);
    writer.close();
    reader = IndexReader.open(dir);
    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
View Full Code Here


    dir.close();
  }
 
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), new ClassicAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
  }
View Full Code Here

  }
 
  /** blast some random large strings through the analyzer */
  public void testRandomHugeStrings() throws Exception {
    Random random = random();
    checkRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
  }
View Full Code Here

    return new LuceneUsersController();
  }
 
  private Analyzer getAnalyzer() throws Exception
  {
    return new ClassicAnalyzer(Version.LUCENE_34);
    //return new KeywordAnalyzer();
    //return new WhitespaceAnalyzer(Version.LUCENE_34);
    //return new StandardAnalyzer(Version.LUCENE_34);
  }
View Full Code Here

public class TestClassicAnalyzer extends BaseTokenStreamTestCase {

  private Analyzer a = new ClassicAnalyzer(TEST_VERSION_CURRENT);

  public void testMaxTermLength() throws Exception {
    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    sa.setMaxTokenLength(5);
    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
  }
View Full Code Here

    sa.setMaxTokenLength(5);
    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
  }

  public void testMaxTermLength2() throws Exception {
    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
    sa.setMaxTokenLength(5);
   
    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, new int[]{1, 1, 2, 1});
  }
View Full Code Here

    assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
  }

  public void testLucene1140() throws Exception {
    try {
      ClassicAnalyzer analyzer = new ClassicAnalyzer(TEST_VERSION_CURRENT);
      assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
    } catch (NullPointerException e) {
      fail("Should not throw an NPE and it did");
    }
View Full Code Here

  }

  public void testDomainNames() throws Exception {
    // Current lucene should not show the bug
    ClassicAnalyzer a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);

    // domain names
    assertAnalyzesTo(a2, "www.nutch.org", new String[]{"www.nutch.org"});
    //Notice the trailing .  See https://issues.apache.org/jira/browse/LUCENE-1068.
    // the following should be recognized as HOST:
    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });

    // 2.3 should show the bug
    a2 = new ClassicAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });

    // 2.4 should not show the bug
    a2 = new ClassicAnalyzer(Version.LUCENE_24);
    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
  }
View Full Code Here

                    "<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
                    "<ALPHANUM>", "<HOST>"});
  }

  public void testJava14BWCompatibility() throws Exception {
    ClassicAnalyzer sa = new ClassicAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
  }
View Full Code Here

   * Make sure we skip wicked long terms.
   */
  public void testWickedLongTerm() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
      TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT)));

    char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
    Arrays.fill(chars, 'x');
    Document doc = new Document();
    final String bigTerm = new String(chars);

    // This produces a too-long term:
    String contents = "abc xyz x" + bigTerm + " another term";
    doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
    writer.addDocument(doc);

    // Make sure we can add another normal document
    doc = new Document();
    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir, true);

    // Make sure all terms < max size were indexed
    assertEquals(2, reader.docFreq(new Term("content", "abc")));
    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
    assertEquals(1, reader.docFreq(new Term("content", "term")));
    assertEquals(1, reader.docFreq(new Term("content", "another")));

    // Make sure position is still incremented when
    // massive term is skipped:
    TermPositions tps = reader.termPositions(new Term("content", "another"));
    assertTrue(tps.next());
    assertEquals(1, tps.freq());
    assertEquals(3, tps.nextPosition());

    // Make sure the doc that has the massive term is in
    // the index:
    assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());

    reader.close();

    // Make sure we can add a document with exactly the
    // maximum length term, and search on that term:
    doc = new Document();
    doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.ANALYZED));
    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    sa.setMaxTokenLength(100000);
    writer  = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
    writer.addDocument(doc);
    writer.close();
    reader = IndexReader.open(dir, true);
    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.standard.ClassicAnalyzer

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.